author    | Teresa Charlin <teresa.charlinreyes@arm.com> | 2023-08-17 18:44:58 +0100
committer | Teresa Charlin <teresa.charlinreyes@arm.com> | 2023-08-28 12:37:25 +0100
commit    | 9145e38edf49fa4862008c163c34590141eecb14 (patch)
tree      | 64706ef579f548b804d5b674b33f6b239c638d0f
parent    | e40cc8359b02a7786908294300c45b672cf6b0e4 (diff)
download  | armnn-9145e38edf49fa4862008c163c34590141eecb14.tar.gz
IVGCVSW-7505 Create FusedLayer and NeonFusedWorkload for AddMulAdd Neon kernel
Signed-off-by: Teresa Charlin <teresa.charlinreyes@arm.com>
Change-Id: Ic778d35b001474b44fb1e433a6fe276e4ec9f565
34 files changed, 637 insertions, 18 deletions
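For orientation before the diff: a minimal, hypothetical sketch (not part of this commit) of how a backend, the intended user of this internal API, could wire up the new fused AddMulAdd node. The helper name is illustrative; the input ordering (input0, input1, bn_mul, bn_add) and the meaning of the two outputs follow the comments in NeonFusedWorkload and the unit test below.

    #include <armnn/INetwork.hpp>
    #include <armnn/Descriptors.hpp>

    // Hypothetical wiring: 4 inputs (input0, input1, bn_mul, bn_add) and
    // 2 outputs (intermediate add result, final result).
    void AddFusedAddMulAdd(armnn::INetwork& network,
                           armnn::IConnectableLayer* input0,  // e.g. shape {1,2,2,3}
                           armnn::IConnectableLayer* input1,  // e.g. shape {1,2,2,3}
                           armnn::IConnectableLayer* bnMul,   // e.g. shape {3}
                           armnn::IConnectableLayer* bnAdd)   // e.g. shape {3}
    {
        armnn::FusedDescriptor descriptor(4u, 2u, armnn::FusedKernelType::AddMulAdd);
        armnn::IConnectableLayer* fused = network.AddFusedLayer(descriptor, "AddMulAdd");

        input0->GetOutputSlot(0).Connect(fused->GetInputSlot(0));
        input1->GetOutputSlot(0).Connect(fused->GetInputSlot(1));
        bnMul->GetOutputSlot(0).Connect(fused->GetInputSlot(2));
        bnAdd->GetOutputSlot(0).Connect(fused->GetInputSlot(3));
        // fused->GetOutputSlot(0): input0 + input1
        // fused->GetOutputSlot(1): (input0 + input1) * bn_mul + bn_add
    }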
diff --git a/Android.mk b/Android.mk
index c32afbeb34..e4cb59c2f4 100644
--- a/Android.mk
+++ b/Android.mk
@@ -233,6 +233,7 @@ LOCAL_SRC_FILES := \
         src/armnn/layers/FillLayer.cpp \
         src/armnn/layers/FloorLayer.cpp \
         src/armnn/layers/FullyConnectedLayer.cpp \
+        src/armnn/layers/FusedLayer.cpp \
         src/armnn/layers/GatherLayer.cpp \
         src/armnn/layers/GatherNdLayer.cpp \
         src/armnn/layers/InputLayer.cpp \
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 42da39b8a8..91561b77d9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -240,6 +240,8 @@ list(APPEND armnn_sources
     src/armnn/layers/FloorLayer.cpp
     src/armnn/layers/FullyConnectedLayer.hpp
     src/armnn/layers/FullyConnectedLayer.cpp
+    src/armnn/layers/FusedLayer.hpp
+    src/armnn/layers/FusedLayer.cpp
     src/armnn/layers/GatherLayer.cpp
     src/armnn/layers/GatherLayer.hpp
     src/armnn/layers/GatherNdLayer.cpp
diff --git a/include/armnn/BackendHelper.hpp b/include/armnn/BackendHelper.hpp
index 59cbbfced3..986f854636 100644
--- a/include/armnn/BackendHelper.hpp
+++ b/include/armnn/BackendHelper.hpp
@@ -194,6 +194,11 @@ public:
                                   const FullyConnectedDescriptor& descriptor,
                                   Optional<std::string&> reasonIfUnsupported = EmptyOptional());

+    bool IsFusedSupported(const std::vector<std::reference_wrapper<TensorInfo>>& inputs,
+                          const std::vector<std::reference_wrapper<TensorInfo>>& outputs,
+                          const FusedDescriptor& descriptor,
+                          Optional<std::string&> reasonIfUnsupported = EmptyOptional());
+
     bool IsGatherSupported(const TensorInfo& input0,
                            const TensorInfo& input1,
                            const TensorInfo& output,
diff --git a/include/armnn/Descriptors.hpp b/include/armnn/Descriptors.hpp
index f60e8f3bea..30eaefd83b 100644
--- a/include/armnn/Descriptors.hpp
+++ b/include/armnn/Descriptors.hpp
@@ -940,6 +940,27 @@ struct FillDescriptor : BaseDescriptor
     float m_Value;
 };

+/// A FusedDescriptor for the FusedLayer.
+struct FusedDescriptor : BaseDescriptor
+{
+    FusedDescriptor(unsigned int numInputSlots = 4u,
+                    unsigned int numOutputSlots = 2u,
+                    FusedKernelType fusedType = FusedKernelType::AddMulAdd)
+        : m_NumInputSlots(numInputSlots), m_NumOutputSlots(numOutputSlots), m_FusedKernelType(fusedType)
+    {}
+
+    bool operator ==(const FusedDescriptor& rhs) const
+    {
+        return m_NumInputSlots == rhs.m_NumInputSlots &&
+               m_NumOutputSlots == rhs.m_NumOutputSlots &&
+               m_FusedKernelType == rhs.m_FusedKernelType;
+    }
+
+    unsigned int m_NumInputSlots;
+    unsigned int m_NumOutputSlots;
+    FusedKernelType m_FusedKernelType;
+};
+
 /// A GatherDescriptor for the GatherLayer.
 struct GatherDescriptor : BaseDescriptor
 {
diff --git a/include/armnn/DescriptorsFwd.hpp b/include/armnn/DescriptorsFwd.hpp
index be1a3f6782..4b9a3e5060 100644
--- a/include/armnn/DescriptorsFwd.hpp
+++ b/include/armnn/DescriptorsFwd.hpp
@@ -25,6 +25,7 @@ struct ElementwiseUnaryDescriptor;
 struct FakeQuantizationDescriptor;
 struct FillDescriptor;
 struct FullyConnectedDescriptor;
+struct FusedDescriptor;
 struct GatherDescriptor;
 struct InstanceNormalizationDescriptor;
 struct L2NormalizationDescriptor;
diff --git a/include/armnn/INetwork.hpp b/include/armnn/INetwork.hpp
index 1640d7c37d..c2c76e3d97 100644
--- a/include/armnn/INetwork.hpp
+++ b/include/armnn/INetwork.hpp
@@ -477,6 +477,14 @@ public:
     IConnectableLayer* AddFullyConnectedLayer(const FullyConnectedDescriptor& fullyConnectedDescriptor,
                                               const char* name = nullptr);

+    /// Adds a Fused layer to the network.
+    /// This method is intended for use by backend users.
+    /// @param fusedDescriptor - FusedDescriptor contains parameters for the Fused layer.
+    /// @param name - Optional name for the layer.
+    /// @return - Interface for configuring the layer.
+    IConnectableLayer* AddFusedLayer(const FusedDescriptor& fusedDescriptor,
+                                     const char* name = nullptr);
+
     /// Adds a permute layer to the network.
     /// @param permuteDescriptor - PermuteDescriptor to configure the permute.
     /// @param name - Optional name for the layer.
diff --git a/include/armnn/Types.hpp b/include/armnn/Types.hpp
index bf4458ee7f..7cb3a859c7 100644
--- a/include/armnn/Types.hpp
+++ b/include/armnn/Types.hpp
@@ -262,6 +262,11 @@ enum class MemBlockStrategyType
     MultiAxisPacking = 1
 };

+enum class FusedKernelType
+{
+    AddMulAdd = 0
+};
+
 /// Each backend should implement an IBackend.
 class IBackend
 {
@@ -475,6 +480,7 @@ using InferenceTimingPair = std::pair<HighResolutionClock, HighResolutionClock>;
     X(ElementwiseBinary) \
     X(ReverseV2) \
     X(Tile) \
+    X(Fused) \

 // New layers should be added at last position to minimize instability.

@@ -486,7 +492,7 @@ enum class LayerType
     LIST_OF_LAYER_TYPE
 #undef X
     FirstLayer = Activation,
-    LastLayer = Tile
+    LastLayer = Fused
 };

 const char* GetLayerTypeAsCString(LayerType type);
diff --git a/include/armnn/TypesUtils.hpp b/include/armnn/TypesUtils.hpp
index eeb5c9e614..ca098f60fb 100644
--- a/include/armnn/TypesUtils.hpp
+++ b/include/armnn/TypesUtils.hpp
@@ -115,6 +115,15 @@ constexpr char const* GetLogicalBinaryOperationAsCString(LogicalBinaryOperation
     }
 }

+constexpr char const* GetFusedTypeAsCString(FusedKernelType type)
+{
+    switch (type)
+    {
+        case FusedKernelType::AddMulAdd: return "AddMulAdd";
+        default:                         return "Unknown";
+    }
+}
+
 constexpr char const* GetPoolingAlgorithmAsCString(PoolingAlgorithm pooling)
 {
     switch (pooling)
diff --git a/include/armnn/backends/WorkloadData.hpp b/include/armnn/backends/WorkloadData.hpp
index 21a597df8a..86796cbcc0 100644
--- a/include/armnn/backends/WorkloadData.hpp
+++ b/include/armnn/backends/WorkloadData.hpp
@@ -182,6 +182,11 @@ struct FullyConnectedQueueDescriptor : QueueDescriptorWithParameters<FullyConnec
     void Validate(const WorkloadInfo& workloadInfo) const;
 };

+struct FusedQueueDescriptor : QueueDescriptorWithParameters<FusedDescriptor>
+{
+    void Validate(const WorkloadInfo& workloadInfo) const;
+};
+
 // Permute layer workload data.
 struct PermuteQueueDescriptor : QueueDescriptorWithParameters<PermuteDescriptor>
 {
diff --git a/src/armnn/BackendHelper.cpp b/src/armnn/BackendHelper.cpp
index f025193006..fc7a2fab83 100644
--- a/src/armnn/BackendHelper.cpp
+++ b/src/armnn/BackendHelper.cpp
@@ -748,6 +748,30 @@ bool LayerSupportHandle::IsFullyConnectedSupported(const TensorInfo& input,
                                             reasonIfUnsupported);
 }

+bool LayerSupportHandle::IsFusedSupported(const std::vector<std::reference_wrapper<TensorInfo>>& inputs,
+                                          const std::vector<std::reference_wrapper<TensorInfo>>& outputs,
+                                          const FusedDescriptor& descriptor,
+                                          Optional<std::string&> reasonIfUnsupported)
+{
+    TensorInfos infos;
+    infos.reserve(inputs.size() + outputs.size());
+    for (TensorInfo inInfo : inputs)
+    {
+        infos.emplace_back(inInfo);
+    }
+    for (TensorInfo outInfo : outputs)
+    {
+        infos.emplace_back(outInfo);
+    }
+
+    return m_LayerSupport->IsLayerSupported(LayerType::Fused,
+                                            infos,
+                                            descriptor,
+                                            EmptyOptional(),
+                                            EmptyOptional(),
+                                            reasonIfUnsupported);
+}
+
 bool LayerSupportHandle::IsGatherSupported(const TensorInfo& input0,
                                            const TensorInfo& input1,
                                            const TensorInfo& output,
diff --git a/src/armnn/LayersFwd.hpp b/src/armnn/LayersFwd.hpp
index 743b8d7205..f83b710134 100644
--- a/src/armnn/LayersFwd.hpp
+++ b/src/armnn/LayersFwd.hpp
@@ -33,6 +33,7 @@
 #include "layers/FillLayer.hpp"
 #include "layers/FloorLayer.hpp"
 #include "layers/FullyConnectedLayer.hpp"
+#include "layers/FusedLayer.hpp"
 #include "layers/GatherLayer.hpp"
 #include "layers/GatherNdLayer.hpp"
 #include "layers/InputLayer.hpp"
@@ -136,6 +137,7 @@ DECLARE_LAYER(FakeQuantization)
 DECLARE_LAYER(Fill)
 DECLARE_LAYER(Floor)
 DECLARE_LAYER(FullyConnected)
+DECLARE_LAYER(Fused)
 DECLARE_LAYER(Gather)
 DECLARE_LAYER(GatherNd)
 DECLARE_LAYER(Input)
diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp
index 41111476da..7f4ef6b1b6 100644
--- a/src/armnn/Network.cpp
+++ b/src/armnn/Network.cpp
@@ -335,6 +335,12 @@ IConnectableLayer* INetwork::AddFullyConnectedLayer(const FullyConnectedDescript
     return pNetworkImpl->AddFullyConnectedLayer(fullyConnectedDescriptor, name);
 }

+IConnectableLayer* INetwork::AddFusedLayer(const FusedDescriptor& fusedDescriptor,
+                                           const char* name)
+{
+    return pNetworkImpl->AddFusedLayer(fusedDescriptor, name);
+}
+
 IConnectableLayer* INetwork::AddPermuteLayer(const PermuteDescriptor& permuteDescriptor,
                                              const char* name)
 {
@@ -2195,6 +2201,12 @@ IConnectableLayer* NetworkImpl::AddFullyConnectedLayer(const FullyConnectedDescr
     return m_Graph->AddLayer<FullyConnectedLayer>(fullyConnectedDescriptor, name);
 }

+IConnectableLayer* NetworkImpl::AddFusedLayer(const FusedDescriptor& fusedDescriptor,
+                                              const char* name)
+{
+    return m_Graph->AddLayer<FusedLayer>(fusedDescriptor, name);
+}
+
 IConnectableLayer* NetworkImpl::AddConcatLayer(const ConcatDescriptor& concatDescriptor,
                                                const char* name)
 {
diff --git a/src/armnn/Network.hpp b/src/armnn/Network.hpp
index 34549248bc..5a3570d825 100644
--- a/src/armnn/Network.hpp
+++ b/src/armnn/Network.hpp
@@ -113,6 +113,9 @@ public:
     IConnectableLayer* AddFullyConnectedLayer(const FullyConnectedDescriptor& fullyConnectedDescriptor,
                                               const char* name = nullptr);

+    IConnectableLayer* AddFusedLayer(const FusedDescriptor& fusedDescriptor,
+                                     const char* name = nullptr);
+
     IConnectableLayer* AddGatherLayer(const GatherDescriptor& gatherDescriptor,
                                       const char* name = nullptr);
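As an aside, a minimal sketch (not part of this commit) of how the capability query added to BackendHelper above could be called. The function name and the "CpuAcc" backend id are illustrative; GetILayerSupportByBackendId and LayerSupportHandle come from the public BackendHelper API.

    #include <armnn/BackendHelper.hpp>
    #include <armnn/Descriptors.hpp>
    #include <functional>
    #include <string>
    #include <vector>

    // Illustrative support query for an AddMulAdd fused node on the Neon backend.
    bool IsAddMulAddSupportedOnNeon(std::vector<armnn::TensorInfo>& inputs,
                                    std::vector<armnn::TensorInfo>& outputs)
    {
        armnn::LayerSupportHandle handle = armnn::GetILayerSupportByBackendId("CpuAcc");
        armnn::FusedDescriptor desc(static_cast<unsigned int>(inputs.size()),
                                    static_cast<unsigned int>(outputs.size()),
                                    armnn::FusedKernelType::AddMulAdd);
        std::string reason;
        return handle.IsFusedSupported({inputs.begin(), inputs.end()},
                                       {outputs.begin(), outputs.end()},
                                       desc,
                                       armnn::Optional<std::string&>(reason));
    }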
diff --git a/src/armnn/SerializeLayerParameters.cpp b/src/armnn/SerializeLayerParameters.cpp
index d65a7d55fa..cc59e1fad3 100644
--- a/src/armnn/SerializeLayerParameters.cpp
+++ b/src/armnn/SerializeLayerParameters.cpp
@@ -325,6 +325,14 @@ void StringifyLayerParameters<PreCompiledDescriptor>::Serialize(ParameterStringi
     fn("NumOutputSlots", std::to_string(desc.m_NumOutputSlots));
 }

+void StringifyLayerParameters<FusedDescriptor>::Serialize(ParameterStringifyFunction& fn,
+                                                          const FusedDescriptor& desc)
+{
+    fn("NumInputSlots", std::to_string(desc.m_NumInputSlots));
+    fn("NumOutputSlots", std::to_string(desc.m_NumOutputSlots));
+    fn("FusedKernelType", GetFusedTypeAsCString(desc.m_FusedKernelType));
+}
+
 void StringifyLayerParameters<Pooling2dDescriptor>::Serialize(ParameterStringifyFunction& fn,
                                                               const Pooling2dDescriptor& desc)
 {
diff --git a/src/armnn/SerializeLayerParameters.hpp b/src/armnn/SerializeLayerParameters.hpp
index 5b0378eab7..34a2986534 100644
--- a/src/armnn/SerializeLayerParameters.hpp
+++ b/src/armnn/SerializeLayerParameters.hpp
@@ -149,6 +149,11 @@ template <> struct StringifyLayerParameters<PreCompiledDescriptor>
     static void Serialize(ParameterStringifyFunction& fn, const PreCompiledDescriptor& desc);
 };

+template <> struct StringifyLayerParameters<FusedDescriptor>
+{
+    static void Serialize(ParameterStringifyFunction& fn, const FusedDescriptor& desc);
+};
+
 template <> struct StringifyLayerParameters<ReduceDescriptor>
 {
     static void Serialize(ParameterStringifyFunction& fn, const ReduceDescriptor& desc);
diff --git a/src/armnn/layers/FusedLayer.cpp b/src/armnn/layers/FusedLayer.cpp
new file mode 100644
index 0000000000..37b1835450
--- /dev/null
+++ b/src/armnn/layers/FusedLayer.cpp
@@ -0,0 +1,48 @@
+//
+// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "FusedLayer.hpp"
+#include "LayerCloneBase.hpp"
+
+#include <armnn/backends/Workload.hpp>
+#include <armnn/TypesUtils.hpp>
+
+namespace armnn
+{
+
+FusedLayer::FusedLayer(const FusedDescriptor& param, const char* name)
+    : LayerWithParameters(param.m_NumInputSlots, param.m_NumOutputSlots, LayerType::Fused, param, name)
+{}
+
+FusedLayer::~FusedLayer()
+{}
+
+FusedLayer* FusedLayer::Clone(Graph& graph) const
+{
+    FusedLayer* clonedLayer = CloneBase<FusedLayer>(graph, m_Param, GetName());
+    clonedLayer->m_AdditionalInfoObject = const_cast<FusedLayer*>(this)->m_AdditionalInfoObject;
+    return clonedLayer;
+}
+
+std::unique_ptr<IWorkload> FusedLayer::CreateWorkload(const armnn::IWorkloadFactory& factory) const
+{
+    FusedQueueDescriptor descriptor;
+    SetAdditionalInfo(descriptor);
+
+    return factory.CreateWorkload(LayerType::Fused, descriptor, PrepInfoAndDesc(descriptor));
+}
+
+void FusedLayer::ValidateTensorShapesFromInputs()
+{
+    // NOTE: since the FusedLayer is an internal layer created from a valid SubgraphView,
+    // we do not need to validate its input shapes
+}
+
+void FusedLayer::ExecuteStrategy(IStrategy& strategy) const
+{
+    strategy.ExecuteStrategy(this, GetParameters(), {}, GetName());
+}
+
+} // namespace armnn
diff --git a/src/armnn/layers/FusedLayer.hpp b/src/armnn/layers/FusedLayer.hpp
new file mode 100644
index 0000000000..e26a379707
--- /dev/null
+++ b/src/armnn/layers/FusedLayer.hpp
@@ -0,0 +1,38 @@
+//
+// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerWithParameters.hpp"
+#include <armnn/backends/WorkloadFactory.hpp>
+
+#include <armnn/Descriptors.hpp>
+
+#include <memory>
+#include <functional>
+
+namespace armnn
+{
+
+class FusedLayer : public LayerWithParameters<FusedDescriptor>
+{
+public:
+    FusedLayer(const FusedDescriptor& param, const char* name);
+    ~FusedLayer();
+
+    virtual std::unique_ptr<IWorkload> CreateWorkload(const IWorkloadFactory& factory) const override;
+
+    FusedLayer* Clone(Graph& graph) const override;
+
+    void ValidateTensorShapesFromInputs() override;
+
+    void ExecuteStrategy(IStrategy& strategy) const override;
+
+private:
+    FusedLayer(const FusedLayer& other) = delete;
+    FusedLayer& operator=(const FusedLayer& other) = delete;
+};
+
+} // namespace armnn
diff --git a/src/backends/backendsCommon/WorkloadData.cpp b/src/backends/backendsCommon/WorkloadData.cpp
index aa6bb848e5..d0f6eea3d4 100644
--- a/src/backends/backendsCommon/WorkloadData.cpp
+++ b/src/backends/backendsCommon/WorkloadData.cpp
@@ -1093,6 +1093,11 @@ void FullyConnectedQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) c
     }
 }

+void FusedQueueDescriptor::Validate(const WorkloadInfo& /*workloadInfo*/) const
+{
+    // This is internally generated, so it should not need validation.
+}
+
 void NormalizationQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
 {
     const std::string descriptorName{"NormalizationQueueDescriptor"};
@@ -3003,7 +3008,7 @@ void SwitchQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const

 void PreCompiledQueueDescriptor::Validate(const WorkloadInfo& /*workloadInfo*/) const
 {
-    // This is internally generated so it should not need validation.
+    // This is internally generated, so it should not need validation.
 }

 void PreluQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
diff --git a/src/backends/backendsCommon/WorkloadFactory.cpp b/src/backends/backendsCommon/WorkloadFactory.cpp
index f067056ce1..6ff237bc12 100644
--- a/src/backends/backendsCommon/WorkloadFactory.cpp
+++ b/src/backends/backendsCommon/WorkloadFactory.cpp
@@ -525,6 +525,36 @@ bool IWorkloadFactory::IsLayerConfigurationSupported(const BackendId& backendId,
                                                            reason);
             break;
         }
+        case LayerType::Fused:
+        {
+            auto cLayer = PolymorphicDowncast<const FusedLayer*>(&layer);
+
+            // Get vector of all outputs.
+            auto getOutTensorInfo = [&dataType](const OutputSlot& slot)
+            {
+                return OverrideDataType(slot.GetTensorInfo(), dataType);
+            };
+            auto beginOutputs = MakeTransformIterator(layer.GetOutputSlots().begin(), getOutTensorInfo);
+            auto endOutputs = MakeTransformIterator(layer.GetOutputSlots().end(), getOutTensorInfo);
+            std::vector<TensorInfo> outputs(beginOutputs, endOutputs);
+            const std::vector<std::reference_wrapper<TensorInfo>> outputPtrs(outputs.begin(), outputs.end());
+
+            // Get vector of all inputs.
+            auto getInputTensorInfo = [&dataType](const InputSlot& slot)
+            {
+                return OverrideDataType(slot.GetTensorInfo(), dataType);
+            };
+            auto beginInputs = MakeTransformIterator(layer.GetInputSlots().begin(), getInputTensorInfo);
+            auto endInputs = MakeTransformIterator(layer.GetInputSlots().end(), getInputTensorInfo);
+            std::vector<TensorInfo> inputs(beginInputs, endInputs);
+            const std::vector<std::reference_wrapper<TensorInfo>> inputPtrs(inputs.begin(), inputs.end());
+
+            result = layerSupportObject.IsFusedSupported(inputPtrs,
+                                                         outputPtrs,
+                                                         cLayer->GetParameters(),
+                                                         reason);
+            break;
+        }
         case LayerType::Gather:
         {
             const TensorInfo& input0 = layer.GetInputSlot(0).GetTensorInfo();
diff --git a/src/backends/backendsCommon/test/CMakeLists.txt b/src/backends/backendsCommon/test/CMakeLists.txt
index 5d8fb1a953..8f3a22d53b 100644
--- a/src/backends/backendsCommon/test/CMakeLists.txt
+++ b/src/backends/backendsCommon/test/CMakeLists.txt
@@ -71,6 +71,7 @@ list(APPEND armnnBackendsCommonUnitTests_sources
     layerTests/ActivationTestImpl.hpp
     layerTests/AdditionTestImpl.cpp
     layerTests/AdditionTestImpl.hpp
+    layerTests/AddMulAddTestImpl.hpp
     layerTests/ArgMinMaxTestImpl.cpp
     layerTests/ArgMinMaxTestImpl.hpp
     layerTests/BatchMatMulTestImpl.cpp
diff --git a/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp b/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp
index ff02e06859..e8a2ec6931 100644
--- a/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp
+++ b/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp
@@ -678,6 +678,8 @@ DECLARE_LAYER_POLICY_1_PARAM(Floor)

 DECLARE_LAYER_POLICY_2_PARAM(FullyConnected)

+DECLARE_LAYER_POLICY_2_PARAM(Fused)
+
 DECLARE_LAYER_POLICY_2_PARAM(Gather)

 DECLARE_LAYER_POLICY_1_PARAM(GatherNd)
diff --git a/src/backends/backendsCommon/test/LayerTests.hpp b/src/backends/backendsCommon/test/LayerTests.hpp
index 7182cb2d47..3f8d045c06 100644
--- a/src/backends/backendsCommon/test/LayerTests.hpp
+++ b/src/backends/backendsCommon/test/LayerTests.hpp
@@ -8,6 +8,7 @@
 #include <backendsCommon/test/layerTests/AbsTestImpl.hpp>
 #include <backendsCommon/test/layerTests/ActivationTestImpl.hpp>
 #include <backendsCommon/test/layerTests/AdditionTestImpl.hpp>
+#include <backendsCommon/test/layerTests/AddMulAddTestImpl.hpp>
 #include <backendsCommon/test/layerTests/ArgMinMaxTestImpl.hpp>
 #include <backendsCommon/test/layerTests/BatchMatMulTestImpl.hpp>
 #include <backendsCommon/test/layerTests/BatchNormalizationTestImpl.hpp>
diff --git a/src/backends/backendsCommon/test/layerTests/AddMulAddTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/AddMulAddTestImpl.hpp
new file mode 100644
index 0000000000..9dece9be3b
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/AddMulAddTestImpl.hpp
@@ -0,0 +1,182 @@
+//
+// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <armnnTestUtils/LayerTestResult.hpp>
+
+#include <armnnUtils/QuantizeHelper.hpp>
+#include <ResolveType.hpp>
+
+#include <armnn/backends/IBackendInternal.hpp>
+#include <armnn/backends/WorkloadFactory.hpp>
+
+#include <armnnTestUtils/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadFactoryHelper.hpp>
+#include <armnnTestUtils/WorkloadTestUtils.hpp>
+
+#include <armnnTestUtils/TensorHelpers.hpp>
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+std::vector<LayerTestResult<T,4>> AddMulAddTest(armnn::IWorkloadFactory& workloadFactory,
+                                                const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+                                                const armnn::ITensorHandleFactory& tensorHandleFactory,
+                                                bool addOutput)
+{
+    using namespace armnn;
+    IgnoreUnused(memoryManager);
+
+    TensorInfo input0TensorInfo({ 1, 2, 2, 3 }, ArmnnType);
+    TensorInfo input1TensorInfo({ 1, 2, 2, 3 }, ArmnnType);
+    TensorInfo mulInput1TensorInfo({ 3 }, ArmnnType);
+    TensorInfo addInput1TensorInfo({ 3 }, ArmnnType);
+
+    TensorInfo output0TensorInfo({ 1, 2, 2, 3 }, ArmnnType);
+    TensorInfo output1TensorInfo({ 1, 2, 2, 3 }, ArmnnType);
+
+    if (IsQuantizedType<T>())
+    {
+        input0TensorInfo.SetQuantizationScale(0.25f);
+        input0TensorInfo.SetQuantizationOffset(128);
+        input1TensorInfo.SetQuantizationScale(0.25f);
+        input1TensorInfo.SetQuantizationOffset(128);
+        mulInput1TensorInfo.SetQuantizationScale(0.25f);
+        mulInput1TensorInfo.SetQuantizationOffset(128);
+        addInput1TensorInfo.SetQuantizationScale(0.25f);
+        addInput1TensorInfo.SetQuantizationOffset(128);
+
+        output0TensorInfo.SetQuantizationScale(0.5f);
+        output0TensorInfo.SetQuantizationOffset(120);
+        output1TensorInfo.SetQuantizationScale(0.5f);
+        output1TensorInfo.SetQuantizationOffset(120);
+    }
+
+    std::vector<float> input0Data
+    {
+         0.0f,  0.0f,  0.0f,
+         1.0f,  1.0f,  1.0f,
+        -1.0f, -1.0f, -1.0f,
+        -2.0f, -2.0f, -2.0f
+    };
+    std::vector<float> input1Data
+    {
+         0.0f,  0.0f,  0.0f,
+         1.0f,  1.0f,  1.0f,
+        -1.0f, -1.0f, -1.0f,
+        -2.0f, -2.0f, -2.0f
+    };
+    std::vector<float> mulInput1Data
+    {
+        2.0f, 1.0f, 1.0f
+    };
+    std::vector<float> addInput1Data
+    {
+        3.0f, 0.0f, 0.0f
+    };
+    std::vector<float> output0ExpectedData =
+    {
+         0.0f,  0.0f,  0.0f,
+         2.0f,  2.0f,  2.0f,
+        -2.0f, -2.0f, -2.0f,
+        -4.0f, -4.0f, -4.0f
+    };
+
+    std::vector<float> output1ExpectedData =
+    {
+         3.0f,  0.0f,  0.0f,
+         7.0f,  2.0f,  2.0f,
+        -1.0f, -2.0f, -2.0f,
+        -5.0f, -4.0f, -4.0f
+    };
+
+    std::vector<T> input0 = armnnUtils::QuantizedVector<T>(input0Data,
+                                                           input0TensorInfo.GetQuantizationScale(),
+                                                           input0TensorInfo.GetQuantizationOffset());
+
+    std::vector<T> input1 = armnnUtils::QuantizedVector<T>(input1Data,
+                                                           input1TensorInfo.GetQuantizationScale(),
+                                                           input1TensorInfo.GetQuantizationOffset());
+
+    std::vector<T> mulInput1 = armnnUtils::QuantizedVector<T>(mulInput1Data,
+                                                              mulInput1TensorInfo.GetQuantizationScale(),
+                                                              mulInput1TensorInfo.GetQuantizationOffset());
+
+    std::vector<T> addInput1 = armnnUtils::QuantizedVector<T>(addInput1Data,
+                                                              addInput1TensorInfo.GetQuantizationScale(),
+                                                              addInput1TensorInfo.GetQuantizationOffset());
+
+    std::vector<T> output0Expected = armnnUtils::QuantizedVector<T>(output0ExpectedData,
+                                                                    output0TensorInfo.GetQuantizationScale(),
+                                                                    output0TensorInfo.GetQuantizationOffset());
+
+    std::vector<T> output1Expected = armnnUtils::QuantizedVector<T>(output1ExpectedData,
+                                                                    output1TensorInfo.GetQuantizationScale(),
+                                                                    output1TensorInfo.GetQuantizationOffset());
+
+    std::vector<T> output0Actual(output0TensorInfo.GetNumElements());
+    std::vector<T> output1Actual(output1TensorInfo.GetNumElements());
+
+    std::unique_ptr<ITensorHandle> input0Handle = tensorHandleFactory.CreateTensorHandle(input0TensorInfo);
+    std::unique_ptr<ITensorHandle> input1Handle = tensorHandleFactory.CreateTensorHandle(input1TensorInfo);
+    std::unique_ptr<ITensorHandle> mulInput1Handle = tensorHandleFactory.CreateTensorHandle(mulInput1TensorInfo);
+    std::unique_ptr<ITensorHandle> addInput1Handle = tensorHandleFactory.CreateTensorHandle(addInput1TensorInfo);
+    std::unique_ptr<ITensorHandle> output0Handle = tensorHandleFactory.CreateTensorHandle(output0TensorInfo);
+    std::unique_ptr<ITensorHandle> output1Handle = tensorHandleFactory.CreateTensorHandle(output1TensorInfo);
+
+    uint32_t numOutputs = addOutput ? 2 : 1;
+    FusedDescriptor descriptor(4, numOutputs, FusedKernelType::AddMulAdd);
+    FusedQueueDescriptor fusedQueueDescriptor;
+    fusedQueueDescriptor.m_Parameters = descriptor;
+
+    WorkloadInfo info;
+    AddInputToWorkload(fusedQueueDescriptor, info, input0TensorInfo, input0Handle.get());
+    AddInputToWorkload(fusedQueueDescriptor, info, input1TensorInfo, input1Handle.get());
+    AddInputToWorkload(fusedQueueDescriptor, info, mulInput1TensorInfo, mulInput1Handle.get());
+    AddInputToWorkload(fusedQueueDescriptor, info, addInput1TensorInfo, addInput1Handle.get());
+    if (addOutput)
+    {
+        AddOutputToWorkload(fusedQueueDescriptor, info, output0TensorInfo, output0Handle.get());
+    }
+    AddOutputToWorkload(fusedQueueDescriptor, info, output1TensorInfo, output1Handle.get());
+
+    std::unique_ptr<IWorkload> workload = workloadFactory.CreateWorkload(LayerType::Fused,
+                                                                         fusedQueueDescriptor,
+                                                                         info);
+
+    input0Handle->Allocate();
+    input1Handle->Allocate();
+    mulInput1Handle->Allocate();
+    addInput1Handle->Allocate();
+    if (addOutput)
+    {
+        output0Handle->Allocate();
+    }
+    output1Handle->Allocate();
+
+    CopyDataToITensorHandle(input0Handle.get(), input0.data());
+    CopyDataToITensorHandle(input1Handle.get(), input1.data());
+    CopyDataToITensorHandle(mulInput1Handle.get(), mulInput1.data());
+    CopyDataToITensorHandle(addInput1Handle.get(), addInput1.data());
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(output1Actual.data(), output1Handle.get());
+    LayerTestResult<T,4> ret1(output1Actual,
+                              output1Expected,
+                              output1Handle->GetShape(),
+                              output1TensorInfo.GetShape());
+
+    std::vector<LayerTestResult<T,4>> ret = {ret1};
+
+    if (addOutput)
+    {
+        CopyDataFromITensorHandle(output0Actual.data(), output0Handle.get());
+        LayerTestResult<T,4> ret0(output0Actual,
+                                  output0Expected,
+                                  output0Handle->GetShape(),
+                                  output0TensorInfo.GetShape());
+        ret = {ret0, ret1};
+    }
+
+    return ret;
+}
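Reading the expected outputs in this test against the NEAddMulAdd argument names, the fused kernel appears to compute, per element i with bn_mul/bn_add broadcast over the innermost (channel) axis: add_output = input0 + input1, then final_output = add_output * bn_mul + bn_add. A plain C++ reference sketch (my reconstruction, not armnn code):

    #include <cstddef>
    #include <vector>

    // Reference semantics of AddMulAdd, as inferred from the test vectors above.
    void AddMulAddReference(const std::vector<float>& in0, const std::vector<float>& in1,
                            const std::vector<float>& bnMul, const std::vector<float>& bnAdd,
                            std::vector<float>& addOut, std::vector<float>& finalOut)
    {
        const std::size_t channels = bnMul.size(); // e.g. 3, broadcast over the innermost axis
        for (std::size_t i = 0; i < in0.size(); ++i)
        {
            addOut[i]   = in0[i] + in1[i];                                  // optional first output
            finalOut[i] = addOut[i] * bnMul[i % channels] + bnAdd[i % channels]; // second output
        }
    }

For example, with in0 = in1 = 1, bnMul = {2,1,1} and bnAdd = {3,0,0}, channel 0 gives (1+1)*2+3 = 7, matching output1ExpectedData above.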
diff --git a/src/backends/cl/ClLayerSupport.cpp b/src/backends/cl/ClLayerSupport.cpp
index 60dab0538a..1acaba0384 100644
--- a/src/backends/cl/ClLayerSupport.cpp
+++ b/src/backends/cl/ClLayerSupport.cpp
@@ -682,7 +682,7 @@ bool ClLayerSupport::IsLayerSupported(const LayerType& type,
         default:
             // layers not supported in cl by default:
             // debug, detectionpostprocess, fakequantization,
-            // precompiled, standin, switch, pooling3d
+            // precompiled, standin, switch, pooling3d, fused
             return false;
     }
 }
diff --git a/src/backends/neon/NeonLayerSupport.cpp b/src/backends/neon/NeonLayerSupport.cpp
index b491ba8493..ef1d21835a 100644
--- a/src/backends/neon/NeonLayerSupport.cpp
+++ b/src/backends/neon/NeonLayerSupport.cpp
@@ -4,7 +4,6 @@
 //

 #include "NeonLayerSupport.hpp"
-#include "NeonBackendId.hpp"
 #include "NeonBackendModelContext.hpp"

 #include <armnn/Exceptions.hpp>
@@ -12,7 +11,6 @@
 #include <armnn/Types.hpp>
 #include <armnn/BackendRegistry.hpp>

-#include <InternalTypes.hpp>
 #include <LayerSupportCommon.hpp>
 #include <armnn/utility/IgnoreUnused.hpp>
 #include <armnn/utility/PolymorphicDowncast.hpp>
@@ -39,8 +37,13 @@
 #include "workloads/NeonDepthToSpaceWorkload.hpp"
 #include "workloads/NeonDepthwiseConvolutionWorkload.hpp"
 #include "workloads/NeonDequantizeWorkload.hpp"
+#include "workloads/NeonDivisionWorkload.hpp"
 #include "workloads/NeonElementwiseBinaryWorkload.hpp"
 #include "workloads/NeonExpWorkload.hpp"
+#include "workloads/NeonFullyConnectedWorkload.hpp"
+#include "workloads/NeonFusedWorkload.hpp"
+#include "workloads/NeonGatherWorkload.hpp"
+#include "workloads/NeonGatherNdWorkload.hpp"
 #include "workloads/NeonInstanceNormalizationWorkload.hpp"
 #include "workloads/NeonL2NormalizationFloatWorkload.hpp"
 #include "workloads/NeonLogWorkload.hpp"
@@ -53,12 +56,8 @@
 #include "workloads/NeonMeanWorkload.hpp"
 #include "workloads/NeonMinimumWorkload.hpp"
 #include "workloads/NeonMultiplicationWorkload.hpp"
-#include "workloads/NeonDivisionWorkload.hpp"
 #include "workloads/NeonNegWorkload.hpp"
 #include "workloads/NeonNormalizationFloatWorkload.hpp"
-#include "workloads/NeonFullyConnectedWorkload.hpp"
-#include "workloads/NeonGatherWorkload.hpp"
-#include "workloads/NeonGatherNdWorkload.hpp"
 #include "workloads/NeonPadWorkload.hpp"
 #include "workloads/NeonPermuteWorkload.hpp"
 #include "workloads/NeonPooling2dWorkload.hpp"
@@ -128,13 +127,13 @@ bool IsSupportedForDataTypeNeon(Optional<std::string&> reasonIfUnsupported,
 {
     return IsNeonBackendSupported(reasonIfUnsupported) &&
            IsSupportedForDataTypeGeneric(reasonIfUnsupported,
-                                                    dataType,
-                                                    floatFuncPtr,
-                                                    floatFuncPtr,
-                                                    uint8FuncPtr,
-                                                    &FalseFunc<>,
-                                                    &FalseFunc<>,
-                                                    std::forward<Params>(params)...);
+                                         dataType,
+                                         floatFuncPtr,
+                                         floatFuncPtr,
+                                         uint8FuncPtr,
+                                         &FalseFunc<>,
+                                         &FalseFunc<>,
+                                         std::forward<Params>(params)...);
 }

 #if defined(ARMCOMPUTENEON_ENABLED)
@@ -430,6 +429,22 @@ bool IsLayerTypeSupported(const LayerType& type,
                 *(PolymorphicDowncast<const FullyConnectedDescriptor*>(&descriptor)),
                 reasonIfUnsupported);
+        case LayerType::Fused:
+        {
+            auto fusedDescriptor = *(PolymorphicDowncast<const FusedDescriptor*>(&descriptor));
+            if (fusedDescriptor.m_NumInputSlots + fusedDescriptor.m_NumOutputSlots != infos.size())
+            {
+                throw InvalidArgumentException("Invalid number of FusedLayer TensorInfos.");
+            }
+
+            std::vector<TensorInfo> inputInfos(infos.begin(), infos.begin() + fusedDescriptor.m_NumInputSlots);
+            std::vector<TensorInfo> outputInfos(infos.begin() + fusedDescriptor.m_NumInputSlots, infos.end());
+
+            return support.IsFusedSupported({inputInfos.begin(), inputInfos.end()},
+                                            {outputInfos.begin(), outputInfos.end()},
+                                            fusedDescriptor,
+                                            reasonIfUnsupported);
+        }
         case LayerType::Gather:
             return support.IsGatherSupported(infos[0],
                                              infos[1],
@@ -1155,6 +1170,19 @@ bool NeonLayerSupport::IsFullyConnectedSupported(const TensorInfo& input,
                                    nullptr);
 }

+bool NeonLayerSupport::IsFusedSupported(const std::vector<std::reference_wrapper<TensorInfo>>& inputs,
+                                        const std::vector<std::reference_wrapper<TensorInfo>>& outputs,
+                                        const FusedDescriptor& descriptor,
+                                        Optional<std::string&> reasonIfUnsupported) const
+{
+    FORWARD_WORKLOAD_VALIDATE_FUNC(NeonFusedWorkloadValidate,
+                                   reasonIfUnsupported,
+                                   inputs,
+                                   outputs,
+                                   descriptor,
+                                   nullptr);
+}
+
 bool NeonLayerSupport::IsGatherSupported(const TensorInfo& input0,
                                          const TensorInfo& input1,
                                          const TensorInfo& output,
diff --git a/src/backends/neon/NeonLayerSupport.hpp b/src/backends/neon/NeonLayerSupport.hpp
index 4bc96acd30..0295c2b3e2 100644
--- a/src/backends/neon/NeonLayerSupport.hpp
+++ b/src/backends/neon/NeonLayerSupport.hpp
@@ -151,6 +151,11 @@ public:
                                  const FullyConnectedDescriptor& descriptor,
                                  Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const;

+    bool IsFusedSupported(const std::vector<std::reference_wrapper<TensorInfo>>& inputs,
+                          const std::vector<std::reference_wrapper<TensorInfo>>& outputs,
+                          const FusedDescriptor& descriptor,
+                          Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const;
+
     bool IsGatherNdSupported(const TensorInfo& input0,
                              const TensorInfo& input1,
                              const TensorInfo& output,
diff --git a/src/backends/neon/NeonWorkloadFactory.cpp b/src/backends/neon/NeonWorkloadFactory.cpp
index e3411de254..4f131ac575 100644
--- a/src/backends/neon/NeonWorkloadFactory.cpp
+++ b/src/backends/neon/NeonWorkloadFactory.cpp
@@ -400,6 +400,11 @@ std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateWorkload(LayerType type,
                                                                 info,
                                                                 m_MemoryManager->GetIntraLayerManager());
         }
+        case LayerType::Fused :
+        {
+            auto fusedQueueDescriptor = PolymorphicDowncast<const FusedQueueDescriptor*>(&descriptor);
+            return std::make_unique<NeonFusedWorkload>(*fusedQueueDescriptor, info);
+        }
         case LayerType::Gather :
         {
             auto gatherQueueDescriptor = PolymorphicDowncast<const GatherQueueDescriptor*>(&descriptor);
diff --git a/src/backends/neon/backend.mk b/src/backends/neon/backend.mk
index 2c91d1491d..3961ed1e34 100644
--- a/src/backends/neon/backend.mk
+++ b/src/backends/neon/backend.mk
@@ -48,6 +48,7 @@ BACKEND_SOURCES := \
         workloads/NeonFillWorkload.cpp \
         workloads/NeonFloorFloatWorkload.cpp \
         workloads/NeonFullyConnectedWorkload.cpp \
+        workloads/NeonFusedWorkload.cpp \
         workloads/NeonGatherWorkload.cpp \
         workloads/NeonGatherNdWorkload.cpp \
         workloads/NeonInstanceNormalizationWorkload.cpp \
diff --git a/src/backends/neon/test/NeonLayerTests.cpp b/src/backends/neon/test/NeonLayerTests.cpp
index 588c90be6d..c9dd1ff507 100644
--- a/src/backends/neon/test/NeonLayerTests.cpp
+++ b/src/backends/neon/test/NeonLayerTests.cpp
@@ -1724,6 +1724,13 @@ ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleConvertFp16ToFp32, SimpleConvertFp16ToFp32Te
 // Convert from Float32 to Float16
 ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleConvertFp32ToFp16, SimpleConvertFp32ToFp16Test)

+// AddMulAdd
+ARMNN_AUTO_TEST_CASE_WITH_THF(AddMulAdd2OutputsFloat32, AddMulAddTest<DataType::Float32>, true)
+ARMNN_AUTO_TEST_CASE_WITH_THF(AddMulAdd2OutputsUint8, AddMulAddTest<DataType::QAsymmU8>, true)
+
+ARMNN_AUTO_TEST_CASE_WITH_THF(AddMulAdd1OutputFloat32, AddMulAddTest<DataType::Float32>, false)
+ARMNN_AUTO_TEST_CASE_WITH_THF(AddMulAdd1OutputUint8, AddMulAddTest<DataType::QAsymmU8>, false)
+
 #if defined(ARMNNREF_ENABLED)

 // The ARMNN_COMPARE_REF_AUTO_TEST_CASE and the ARMNN_COMPARE_REF_FIXTURE_TEST_CASE test units are not available
diff --git a/src/backends/neon/workloads/CMakeLists.txt b/src/backends/neon/workloads/CMakeLists.txt
index 2cb2ccf385..f4438e4baa 100644
--- a/src/backends/neon/workloads/CMakeLists.txt
+++ b/src/backends/neon/workloads/CMakeLists.txt
@@ -54,6 +54,8 @@ list(APPEND armnnNeonBackendWorkloads_sources
     NeonFloorFloatWorkload.hpp
     NeonFullyConnectedWorkload.cpp
     NeonFullyConnectedWorkload.hpp
+    NeonFusedWorkload.cpp
+    NeonFusedWorkload.hpp
     NeonGatherWorkload.cpp
     NeonGatherWorkload.hpp
     NeonGatherNdWorkload.cpp
diff --git a/src/backends/neon/workloads/NeonFusedWorkload.cpp b/src/backends/neon/workloads/NeonFusedWorkload.cpp
new file mode 100644
index 0000000000..f770f46c81
--- /dev/null
+++ b/src/backends/neon/workloads/NeonFusedWorkload.cpp
@@ -0,0 +1,115 @@
+//
+// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonFusedWorkload.hpp"
+#include "NeonWorkloadUtils.hpp"
+
+#include <aclCommon/ArmComputeTensorUtils.hpp>
+#include <aclCommon/ArmComputeUtils.hpp>
+
+#include <armnn/utility/PolymorphicDowncast.hpp>
+#include <armnn/backends/TensorHandle.hpp>
+
+#include <arm_compute/runtime/NEON/functions/NEAddMulAdd.h>
+
+namespace armnn
+{
+
+using namespace armcomputetensorutils;
+
+arm_compute::Status NeonFusedWorkloadValidate(const std::vector<std::reference_wrapper<TensorInfo>>& inputInfos,
+                                              const std::vector<std::reference_wrapper<TensorInfo>>& outputInfos,
+                                              const FusedDescriptor& fusedDescriptor,
+                                              const ActivationDescriptor* activationDescriptor)
+{
+    std::vector<arm_compute::TensorInfo> actInputInfos;
+    actInputInfos.reserve(inputInfos.size());
+    for (size_t i = 0u; i < inputInfos.size(); ++i)
+    {
+        actInputInfos.emplace_back(BuildArmComputeTensorInfo(inputInfos[i]));
+    }
+
+    std::vector<arm_compute::TensorInfo> actOutputInfos;
+    actOutputInfos.reserve(outputInfos.size());
+    for (size_t i = 0u; i < outputInfos.size(); ++i)
+    {
+        actOutputInfos.emplace_back(BuildArmComputeTensorInfo(outputInfos[i]));
+    }
+
+    const arm_compute::ActivationLayerInfo activationInfo =
+        ConvertActivationDescriptorToAclActivationLayerInfo(activationDescriptor);
+
+    switch (fusedDescriptor.m_FusedKernelType)
+    {
+        case FusedKernelType::AddMulAdd:
+            return arm_compute::NEAddMulAdd::validate(
+                        &actInputInfos[0],
+                        &actInputInfos[1],
+                        &actInputInfos[2], // bn_mul
+                        &actInputInfos[3], // bn_add
+                        actOutputInfos.size() == 1 ? nullptr : &actOutputInfos[0],           // add_output
+                        actOutputInfos.size() == 1 ? &actOutputInfos[0] : &actOutputInfos[1], // final_output
+                        arm_compute::ConvertPolicy::SATURATE,
+                        activationInfo);
+        default:
+            return arm_compute::Status{arm_compute::ErrorCode::RUNTIME_ERROR,
+                                       "NeonFusedWorkloadValidate: no valid kernel type"};
+    }
+}
+
+NeonFusedWorkload::NeonFusedWorkload(const FusedQueueDescriptor& descriptor, const WorkloadInfo& info)
+    : NeonBaseWorkload<FusedQueueDescriptor>(descriptor, info)
+{
+    m_Data.ValidateInputsOutputs("NeonFusedWorkload",
+                                 static_cast<unsigned int>(info.m_InputTensorInfos.size()),
+                                 static_cast<unsigned int>(info.m_OutputTensorInfos.size()));
+
+    std::vector<arm_compute::ITensor*> inputs;
+    inputs.reserve(info.m_InputTensorInfos.size());
+    for (auto input : m_Data.m_Inputs)
+    {
+        inputs.emplace_back(&PolymorphicDowncast<IAclTensorHandle*>(input)->GetTensor());
+    }
+
+    std::vector<arm_compute::ITensor*> outputs;
+    outputs.reserve(info.m_OutputTensorInfos.size());
+    for (auto output : m_Data.m_Outputs)
+    {
+        outputs.emplace_back(&PolymorphicDowncast<IAclTensorHandle*>(output)->GetTensor());
+    }
+
+    const arm_compute::ActivationLayerInfo activationInfo =
+        ConvertAdditionalInfoToAclActivationLayerInfo(descriptor);
+
+    switch (descriptor.m_Parameters.m_FusedKernelType)
+    {
+        case FusedKernelType::AddMulAdd:
+        {
+            auto layer = std::make_unique<arm_compute::NEAddMulAdd>();
+            layer->configure(inputs[0],
+                             inputs[1],
+                             inputs[2], // bn_mul
+                             inputs[3], // bn_add
+                             outputs.size() == 1 ? nullptr : outputs[0],    // add_output
+                             outputs.size() == 1 ? outputs[0] : outputs[1], // final_output
+                             arm_compute::ConvertPolicy::SATURATE,
+                             activationInfo);
+            m_FusedLayer.reset(layer.release());
+            break;
+        }
+        default:
+            throw Exception("NeonFusedWorkload: no valid kernel type.");
+    }
+}
+
+void NeonFusedWorkload::Execute() const
+{
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonFusedWorkload_Execute", this->GetGuid());
+    m_FusedLayer->run();
+}
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonFusedWorkload.hpp b/src/backends/neon/workloads/NeonFusedWorkload.hpp
new file mode 100644
index 0000000000..aaabf61560
--- /dev/null
+++ b/src/backends/neon/workloads/NeonFusedWorkload.hpp
@@ -0,0 +1,35 @@
+//
+// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "NeonBaseWorkload.hpp"
+
+#include <arm_compute/core/Error.h>
+#include <arm_compute/core/Types.h>
+#include <arm_compute/runtime/IFunction.h>
+
+namespace armnn
+{
+
+arm_compute::Status NeonFusedWorkloadValidate(const std::vector<std::reference_wrapper<TensorInfo>>& inputInfos,
+                                              const std::vector<std::reference_wrapper<TensorInfo>>& outputInfos,
+                                              const FusedDescriptor& fusedDescriptor,
+                                              const ActivationDescriptor* activationDescriptor = nullptr);
+
+class NeonFusedWorkload : public NeonBaseWorkload<FusedQueueDescriptor>
+{
+public:
+    NeonFusedWorkload(const FusedQueueDescriptor& descriptor, const WorkloadInfo& info);
+    virtual void Execute() const override;
+
+private:
+    std::unique_ptr<arm_compute::IFunction> m_FusedLayer;
+};
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonWorkloads.hpp b/src/backends/neon/workloads/NeonWorkloads.hpp
index b72f3bb703..615e5d87c8 100644
--- a/src/backends/neon/workloads/NeonWorkloads.hpp
+++ b/src/backends/neon/workloads/NeonWorkloads.hpp
@@ -30,6 +30,7 @@
 #include "NeonFillWorkload.hpp"
 #include "NeonFloorFloatWorkload.hpp"
 #include "NeonFullyConnectedWorkload.hpp"
+#include "NeonFusedWorkload.hpp"
 #include "NeonGatherWorkload.hpp"
 #include "NeonGatherNdWorkload.hpp"
 #include "NeonInstanceNormalizationWorkload.hpp"
diff --git a/src/backends/reference/RefLayerSupport.cpp b/src/backends/reference/RefLayerSupport.cpp
index 6323db1c03..0b1b9c7824 100644
--- a/src/backends/reference/RefLayerSupport.cpp
+++ b/src/backends/reference/RefLayerSupport.cpp
@@ -539,8 +539,8 @@ bool RefLayerSupport::IsLayerSupported(const LayerType& type,
                                               quantizedLstmInputParamsInfo.value(),
                                               reasonIfUnsupported);
         default:
-            // layers not supported in neon by default:
-            // precompiled, standin, switch
+            // layers not supported in reference by default:
+            // precompiled, standin, switch, fused
            return false;
     }
 }
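Note the output mapping used in both validate() and configure() above: with a single output slot, the intermediate add_output is passed as nullptr and the lone tensor receives the final result. A hypothetical direct call to the validate helper showing that single-output case (the include path and the surrounding function are assumptions; in practice this is reached via FORWARD_WORKLOAD_VALIDATE_FUNC in NeonLayerSupport::IsFusedSupported):

    #include <armnn/Descriptors.hpp>
    #include "NeonFusedWorkload.hpp" // assumed path: src/backends/neon/workloads/
    #include <functional>
    #include <vector>

    bool CheckAddMulAddSingleOutput()
    {
        armnn::TensorInfo data({1, 2, 2, 3}, armnn::DataType::Float32);
        armnn::TensorInfo bn({3}, armnn::DataType::Float32);
        armnn::TensorInfo out({1, 2, 2, 3}, armnn::DataType::Float32);

        // Four inputs (input0, input1, bn_mul, bn_add), one output: only the
        // final result is produced; add_output is dropped (passed as nullptr).
        std::vector<std::reference_wrapper<armnn::TensorInfo>> inputs{data, data, bn, bn};
        std::vector<std::reference_wrapper<armnn::TensorInfo>> outputs{out};
        armnn::FusedDescriptor desc(4u, 1u, armnn::FusedKernelType::AddMulAdd);

        arm_compute::Status status = armnn::NeonFusedWorkloadValidate(inputs, outputs, desc);
        return status.error_code() == arm_compute::ErrorCode::OK;
    }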