diff options
author | Teresa Charlin <teresa.charlinreyes@arm.com> | 2023-08-17 18:44:58 +0100 |
---|---|---|
committer | Teresa Charlin <teresa.charlinreyes@arm.com> | 2023-08-28 12:37:25 +0100 |
commit | 9145e38edf49fa4862008c163c34590141eecb14 (patch) | |
tree | 64706ef579f548b804d5b674b33f6b239c638d0f /src/backends | |
parent | e40cc8359b02a7786908294300c45b672cf6b0e4 (diff) | |
download | armnn-9145e38edf49fa4862008c163c34590141eecb14.tar.gz |
IVGCVSW-7505 Create FusedLayer and NeonFusedWorkload for AddMulAdd Neon kernel
Signed-off-by: Teresa Charlin <teresa.charlinreyes@arm.com>
Change-Id: Ic778d35b001474b44fb1e433a6fe276e4ec9f565
Diffstat (limited to 'src/backends')
-rw-r--r-- | src/backends/backendsCommon/WorkloadData.cpp | 7 | ||||
-rw-r--r-- | src/backends/backendsCommon/WorkloadFactory.cpp | 30 | ||||
-rw-r--r-- | src/backends/backendsCommon/test/CMakeLists.txt | 1 | ||||
-rw-r--r-- | src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp | 2 | ||||
-rw-r--r-- | src/backends/backendsCommon/test/LayerTests.hpp | 1 | ||||
-rw-r--r-- | src/backends/backendsCommon/test/layerTests/AddMulAddTestImpl.hpp | 182 | ||||
-rw-r--r-- | src/backends/cl/ClLayerSupport.cpp | 2 | ||||
-rw-r--r-- | src/backends/neon/NeonLayerSupport.cpp | 54 | ||||
-rw-r--r-- | src/backends/neon/NeonLayerSupport.hpp | 5 | ||||
-rw-r--r-- | src/backends/neon/NeonWorkloadFactory.cpp | 5 | ||||
-rw-r--r-- | src/backends/neon/backend.mk | 1 | ||||
-rw-r--r-- | src/backends/neon/test/NeonLayerTests.cpp | 7 | ||||
-rw-r--r-- | src/backends/neon/workloads/CMakeLists.txt | 2 | ||||
-rw-r--r-- | src/backends/neon/workloads/NeonFusedWorkload.cpp | 115 | ||||
-rw-r--r-- | src/backends/neon/workloads/NeonFusedWorkload.hpp | 35 | ||||
-rw-r--r-- | src/backends/neon/workloads/NeonWorkloads.hpp | 1 | ||||
-rw-r--r-- | src/backends/reference/RefLayerSupport.cpp | 4 |
17 files changed, 437 insertions, 17 deletions
diff --git a/src/backends/backendsCommon/WorkloadData.cpp b/src/backends/backendsCommon/WorkloadData.cpp index aa6bb848e5..d0f6eea3d4 100644 --- a/src/backends/backendsCommon/WorkloadData.cpp +++ b/src/backends/backendsCommon/WorkloadData.cpp @@ -1093,6 +1093,11 @@ void FullyConnectedQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) c } } +void FusedQueueDescriptor::Validate(const WorkloadInfo& /*workloadInfo*/) const +{ + // This is internally generated, so it should not need validation. +} + void NormalizationQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const { const std::string descriptorName{"NormalizationQueueDescriptor"}; @@ -3003,7 +3008,7 @@ void SwitchQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const void PreCompiledQueueDescriptor::Validate(const WorkloadInfo& /*workloadInfo*/) const { - // This is internally generated so it should not need validation. + // This is internally generated, so it should not need validation. } void PreluQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const diff --git a/src/backends/backendsCommon/WorkloadFactory.cpp b/src/backends/backendsCommon/WorkloadFactory.cpp index f067056ce1..6ff237bc12 100644 --- a/src/backends/backendsCommon/WorkloadFactory.cpp +++ b/src/backends/backendsCommon/WorkloadFactory.cpp @@ -525,6 +525,36 @@ bool IWorkloadFactory::IsLayerConfigurationSupported(const BackendId& backendId, reason); break; } + case LayerType::Fused: + { + auto cLayer = PolymorphicDowncast<const FusedLayer*>(&layer); + + // Get vector of all outputs. + auto getOutTensorInfo = [&dataType](const OutputSlot& slot) + { + return OverrideDataType(slot.GetTensorInfo(), dataType); + }; + auto beginOutputs = MakeTransformIterator(layer.GetOutputSlots().begin(), getOutTensorInfo); + auto endOutputs = MakeTransformIterator(layer.GetOutputSlots().end(), getOutTensorInfo); + std::vector<TensorInfo> outputs(beginOutputs, endOutputs); + const std::vector<std::reference_wrapper<TensorInfo>> outputPtrs(outputs.begin(), outputs.end()); + + // Get vector of all inputs. + auto getInputTensorInfo = [&dataType](const InputSlot& slot) + { + return OverrideDataType(slot.GetTensorInfo(), dataType); + }; + auto beginInputs = MakeTransformIterator(layer.GetInputSlots().begin(), getInputTensorInfo); + auto endInputs = MakeTransformIterator(layer.GetInputSlots().end(), getInputTensorInfo); + std::vector<TensorInfo> inputs(beginInputs, endInputs); + const std::vector<std::reference_wrapper<TensorInfo>> inputPtrs(inputs.begin(), inputs.end()); + + result = layerSupportObject.IsFusedSupported(inputPtrs, + outputPtrs, + cLayer->GetParameters(), + reason); + break; + } case LayerType::Gather: { const TensorInfo& input0 = layer.GetInputSlot(0).GetTensorInfo(); diff --git a/src/backends/backendsCommon/test/CMakeLists.txt b/src/backends/backendsCommon/test/CMakeLists.txt index 5d8fb1a953..8f3a22d53b 100644 --- a/src/backends/backendsCommon/test/CMakeLists.txt +++ b/src/backends/backendsCommon/test/CMakeLists.txt @@ -71,6 +71,7 @@ list(APPEND armnnBackendsCommonUnitTests_sources layerTests/ActivationTestImpl.hpp layerTests/AdditionTestImpl.cpp layerTests/AdditionTestImpl.hpp + layerTests/AddMulAddTestImpl.hpp layerTests/ArgMinMaxTestImpl.cpp layerTests/ArgMinMaxTestImpl.hpp layerTests/BatchMatMulTestImpl.cpp diff --git a/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp b/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp index ff02e06859..e8a2ec6931 100644 --- a/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp +++ b/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp @@ -678,6 +678,8 @@ DECLARE_LAYER_POLICY_1_PARAM(Floor) DECLARE_LAYER_POLICY_2_PARAM(FullyConnected) +DECLARE_LAYER_POLICY_2_PARAM(Fused) + DECLARE_LAYER_POLICY_2_PARAM(Gather) DECLARE_LAYER_POLICY_1_PARAM(GatherNd) diff --git a/src/backends/backendsCommon/test/LayerTests.hpp b/src/backends/backendsCommon/test/LayerTests.hpp index 7182cb2d47..3f8d045c06 100644 --- a/src/backends/backendsCommon/test/LayerTests.hpp +++ b/src/backends/backendsCommon/test/LayerTests.hpp @@ -8,6 +8,7 @@ #include <backendsCommon/test/layerTests/AbsTestImpl.hpp> #include <backendsCommon/test/layerTests/ActivationTestImpl.hpp> #include <backendsCommon/test/layerTests/AdditionTestImpl.hpp> +#include <backendsCommon/test/layerTests/AddMulAddTestImpl.hpp> #include <backendsCommon/test/layerTests/ArgMinMaxTestImpl.hpp> #include <backendsCommon/test/layerTests/BatchMatMulTestImpl.hpp> #include <backendsCommon/test/layerTests/BatchNormalizationTestImpl.hpp> diff --git a/src/backends/backendsCommon/test/layerTests/AddMulAddTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/AddMulAddTestImpl.hpp new file mode 100644 index 0000000000..9dece9be3b --- /dev/null +++ b/src/backends/backendsCommon/test/layerTests/AddMulAddTestImpl.hpp @@ -0,0 +1,182 @@ +// +// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#pragma once + +#include <armnnTestUtils/LayerTestResult.hpp> + +#include <armnnUtils/QuantizeHelper.hpp> +#include <ResolveType.hpp> + +#include <armnn/backends/IBackendInternal.hpp> +#include <armnn/backends/WorkloadFactory.hpp> + +#include <armnnTestUtils/TensorCopyUtils.hpp> +#include <backendsCommon/test/WorkloadFactoryHelper.hpp> +#include <armnnTestUtils/WorkloadTestUtils.hpp> + +#include <armnnTestUtils/TensorHelpers.hpp> + +template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>> +std::vector<LayerTestResult<T,4>> AddMulAddTest(armnn::IWorkloadFactory& workloadFactory, + const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager, + const armnn::ITensorHandleFactory& tensorHandleFactory, + bool addOutput) +{ + using namespace armnn; + IgnoreUnused(memoryManager); + + TensorInfo input0TensorInfo({ 1, 2, 2, 3 }, ArmnnType); + TensorInfo input1TensorInfo({ 1, 2, 2, 3 }, ArmnnType); + TensorInfo mulInput1TensorInfo({ 3 }, ArmnnType); + TensorInfo addInput1TensorInfo({ 3 }, ArmnnType); + + TensorInfo output0TensorInfo({ 1, 2, 2, 3 }, ArmnnType); + TensorInfo output1TensorInfo({ 1, 2, 2, 3 }, ArmnnType); + + if (IsQuantizedType<T>()) + { + input0TensorInfo.SetQuantizationScale(0.25f); + input0TensorInfo.SetQuantizationOffset(128); + input1TensorInfo.SetQuantizationScale(0.25f); + input1TensorInfo.SetQuantizationOffset(128); + mulInput1TensorInfo.SetQuantizationScale(0.25f); + mulInput1TensorInfo.SetQuantizationOffset(128); + addInput1TensorInfo.SetQuantizationScale(0.25f); + addInput1TensorInfo.SetQuantizationOffset(128); + + output0TensorInfo.SetQuantizationScale(0.5f); + output0TensorInfo.SetQuantizationOffset(120); + output1TensorInfo.SetQuantizationScale(0.5f); + output1TensorInfo.SetQuantizationOffset(120); + } + + std::vector<float> input0Data + { + 0.0f, 0.0f, 0.0f, + 1.0f, 1.0f, 1.0f, + -1.0f, -1.0f, -1.0f, + -2.0f, -2.0f, -2.0f + }; + std::vector<float> input1Data + { + 0.0f, 0.0f, 0.0f, + 1.0f, 1.0f, 1.0f, + -1.0f, -1.0f, -1.0f, + -2.0f, -2.0f, -2.0f + }; + std::vector<float> mulInput1Data + { + 2.0f, 1.0f, 1.0f + }; + std::vector<float> addInput1Data + { + 3.0f, 0.0f, 0.0f + }; + std::vector<float> output0ExpectedData = + { + 0.0f, 0.0f, 0.0f, + 2.0f, 2.0f, 2.0f, + -2.0f, -2.0f, -2.0f, + -4.0f, -4.0f, -4.0f + }; + + std::vector<float> output1ExpectedData = + { + 3.0f, 0.0f, 0.0f, + 7.0f, 2.0f, 2.0f, + -1.0f, -2.0f, -2.0f, + -5.0f, -4.0f, -4.0f + }; + + std::vector<T> input0 = armnnUtils::QuantizedVector<T>(input0Data, + input0TensorInfo.GetQuantizationScale(), + input0TensorInfo.GetQuantizationOffset()); + + std::vector<T> input1 = armnnUtils::QuantizedVector<T>(input1Data, + input1TensorInfo.GetQuantizationScale(), + input1TensorInfo.GetQuantizationOffset()); + + std::vector<T> mulInput1 = armnnUtils::QuantizedVector<T>(mulInput1Data, + mulInput1TensorInfo.GetQuantizationScale(), + mulInput1TensorInfo.GetQuantizationOffset()); + + std::vector<T> addInput1 = armnnUtils::QuantizedVector<T>(addInput1Data, + addInput1TensorInfo.GetQuantizationScale(), + addInput1TensorInfo.GetQuantizationOffset()); + + std::vector<T> output0Expected = armnnUtils::QuantizedVector<T>(output0ExpectedData, + output0TensorInfo.GetQuantizationScale(), + output0TensorInfo.GetQuantizationOffset()); + + std::vector<T> output1Expected = armnnUtils::QuantizedVector<T>(output1ExpectedData, + output1TensorInfo.GetQuantizationScale(), + output1TensorInfo.GetQuantizationOffset()); + + std::vector<T> output0Actual(output0TensorInfo.GetNumElements()); + std::vector<T> output1Actual(output1TensorInfo.GetNumElements()); + + std::unique_ptr<ITensorHandle> input0Handle = tensorHandleFactory.CreateTensorHandle(input0TensorInfo); + std::unique_ptr<ITensorHandle> input1Handle = tensorHandleFactory.CreateTensorHandle(input1TensorInfo); + std::unique_ptr<ITensorHandle> mulInput1Handle = tensorHandleFactory.CreateTensorHandle(mulInput1TensorInfo); + std::unique_ptr<ITensorHandle> addInput1Handle = tensorHandleFactory.CreateTensorHandle(addInput1TensorInfo); + std::unique_ptr<ITensorHandle> output0Handle = tensorHandleFactory.CreateTensorHandle(output0TensorInfo); + std::unique_ptr<ITensorHandle> output1Handle = tensorHandleFactory.CreateTensorHandle(output1TensorInfo); + + uint32_t numOutputs = addOutput ? 2 : 1; + FusedDescriptor descriptor(4, numOutputs, FusedKernelType::AddMulAdd); + FusedQueueDescriptor fusedQueueDescriptor; + fusedQueueDescriptor.m_Parameters = descriptor; + WorkloadInfo info; + AddInputToWorkload (fusedQueueDescriptor, info, input0TensorInfo, input0Handle.get()); + AddInputToWorkload (fusedQueueDescriptor, info, input1TensorInfo, input1Handle.get()); + AddInputToWorkload (fusedQueueDescriptor, info, mulInput1TensorInfo, mulInput1Handle.get()); + AddInputToWorkload (fusedQueueDescriptor, info, addInput1TensorInfo, addInput1Handle.get()); + if (addOutput) + { + AddOutputToWorkload(fusedQueueDescriptor, info, output0TensorInfo, output0Handle.get()); + } + AddOutputToWorkload(fusedQueueDescriptor, info, output1TensorInfo, output1Handle.get()); + + std::unique_ptr<IWorkload> workload = workloadFactory.CreateWorkload(LayerType::Fused, + fusedQueueDescriptor, + info); + + input0Handle->Allocate(); + input1Handle->Allocate(); + mulInput1Handle->Allocate(); + addInput1Handle->Allocate(); + if (addOutput) + { + output0Handle->Allocate(); + } + output1Handle->Allocate(); + + CopyDataToITensorHandle(input0Handle.get(), input0.data()); + CopyDataToITensorHandle(input1Handle.get(), input1.data()); + CopyDataToITensorHandle(mulInput1Handle.get(), mulInput1.data()); + CopyDataToITensorHandle(addInput1Handle.get(), addInput1.data()); + + workload->Execute(); + + CopyDataFromITensorHandle(output1Actual.data(), output1Handle.get()); + LayerTestResult<T,4> ret1(output1Actual, + output1Expected, + output1Handle->GetShape(), + output1TensorInfo.GetShape()); + + std::vector<LayerTestResult<T,4>> ret = {ret1}; + + if (addOutput) + { + CopyDataFromITensorHandle(output0Actual.data(), output0Handle.get()); + LayerTestResult<T,4> ret0(output0Actual, + output0Expected, + output0Handle->GetShape(), + output0TensorInfo.GetShape()); + ret = {ret0, ret1}; + } + return ret; +} diff --git a/src/backends/cl/ClLayerSupport.cpp b/src/backends/cl/ClLayerSupport.cpp index 60dab0538a..1acaba0384 100644 --- a/src/backends/cl/ClLayerSupport.cpp +++ b/src/backends/cl/ClLayerSupport.cpp @@ -682,7 +682,7 @@ bool ClLayerSupport::IsLayerSupported(const LayerType& type, default: // layers not supported in cl by default: // debug, detectionpostprocess, fakequantization, - // precompiled, standin, switch, pooling3d + // precompiled, standin, switch, pooling3d, fused return false; } } diff --git a/src/backends/neon/NeonLayerSupport.cpp b/src/backends/neon/NeonLayerSupport.cpp index b491ba8493..ef1d21835a 100644 --- a/src/backends/neon/NeonLayerSupport.cpp +++ b/src/backends/neon/NeonLayerSupport.cpp @@ -4,7 +4,6 @@ // #include "NeonLayerSupport.hpp" -#include "NeonBackendId.hpp" #include "NeonBackendModelContext.hpp" #include <armnn/Exceptions.hpp> @@ -12,7 +11,6 @@ #include <armnn/Types.hpp> #include <armnn/BackendRegistry.hpp> -#include <InternalTypes.hpp> #include <LayerSupportCommon.hpp> #include <armnn/utility/IgnoreUnused.hpp> #include <armnn/utility/PolymorphicDowncast.hpp> @@ -39,8 +37,13 @@ #include "workloads/NeonDepthToSpaceWorkload.hpp" #include "workloads/NeonDepthwiseConvolutionWorkload.hpp" #include "workloads/NeonDequantizeWorkload.hpp" +#include "workloads/NeonDivisionWorkload.hpp" #include "workloads/NeonElementwiseBinaryWorkload.hpp" #include "workloads/NeonExpWorkload.hpp" +#include "workloads/NeonFullyConnectedWorkload.hpp" +#include "workloads/NeonFusedWorkload.hpp" +#include "workloads/NeonGatherWorkload.hpp" +#include "workloads/NeonGatherNdWorkload.hpp" #include "workloads/NeonInstanceNormalizationWorkload.hpp" #include "workloads/NeonL2NormalizationFloatWorkload.hpp" #include "workloads/NeonLogWorkload.hpp" @@ -53,12 +56,8 @@ #include "workloads/NeonMeanWorkload.hpp" #include "workloads/NeonMinimumWorkload.hpp" #include "workloads/NeonMultiplicationWorkload.hpp" -#include "workloads/NeonDivisionWorkload.hpp" #include "workloads/NeonNegWorkload.hpp" #include "workloads/NeonNormalizationFloatWorkload.hpp" -#include "workloads/NeonFullyConnectedWorkload.hpp" -#include "workloads/NeonGatherWorkload.hpp" -#include "workloads/NeonGatherNdWorkload.hpp" #include "workloads/NeonPadWorkload.hpp" #include "workloads/NeonPermuteWorkload.hpp" #include "workloads/NeonPooling2dWorkload.hpp" @@ -128,13 +127,13 @@ bool IsSupportedForDataTypeNeon(Optional<std::string&> reasonIfUnsupported, { return IsNeonBackendSupported(reasonIfUnsupported) && IsSupportedForDataTypeGeneric(reasonIfUnsupported, - dataType, - floatFuncPtr, - floatFuncPtr, - uint8FuncPtr, - &FalseFunc<>, - &FalseFunc<>, - std::forward<Params>(params)...); + dataType, + floatFuncPtr, + floatFuncPtr, + uint8FuncPtr, + &FalseFunc<>, + &FalseFunc<>, + std::forward<Params>(params)...); } #if defined(ARMCOMPUTENEON_ENABLED) @@ -430,6 +429,22 @@ bool IsLayerTypeSupported(const LayerType& type, *(PolymorphicDowncast<const FullyConnectedDescriptor*>(&descriptor)), reasonIfUnsupported); + case LayerType::Fused: + { + auto fusedDescriptor = *(PolymorphicDowncast<const FusedDescriptor*>(&descriptor)); + if (fusedDescriptor.m_NumInputSlots + fusedDescriptor.m_NumOutputSlots != infos.size()) + { + throw InvalidArgumentException("Invalid number of FusedLayer TensorInfos."); + } + + std::vector<TensorInfo> inputInfos(infos.begin(), infos.begin() + fusedDescriptor.m_NumInputSlots); + std::vector<TensorInfo> outputInfos(infos.begin() + fusedDescriptor.m_NumInputSlots, infos.end()); + + return support.IsFusedSupported({inputInfos.begin(), inputInfos.end()}, + {outputInfos.begin(), outputInfos.end()}, + fusedDescriptor, + reasonIfUnsupported); + } case LayerType::Gather: return support.IsGatherSupported(infos[0], infos[1], @@ -1155,6 +1170,19 @@ bool NeonLayerSupport::IsFullyConnectedSupported(const TensorInfo& input, nullptr); } +bool NeonLayerSupport::IsFusedSupported(const std::vector<std::reference_wrapper<TensorInfo>>& inputs, + const std::vector<std::reference_wrapper<TensorInfo>>& outputs, + const FusedDescriptor& descriptor, + Optional<std::string&> reasonIfUnsupported) const +{ + FORWARD_WORKLOAD_VALIDATE_FUNC(NeonFusedWorkloadValidate, + reasonIfUnsupported, + inputs, + outputs, + descriptor, + nullptr); +} + bool NeonLayerSupport::IsGatherSupported(const TensorInfo& input0, const TensorInfo& input1, const TensorInfo& output, diff --git a/src/backends/neon/NeonLayerSupport.hpp b/src/backends/neon/NeonLayerSupport.hpp index 4bc96acd30..0295c2b3e2 100644 --- a/src/backends/neon/NeonLayerSupport.hpp +++ b/src/backends/neon/NeonLayerSupport.hpp @@ -151,6 +151,11 @@ public: const FullyConnectedDescriptor& descriptor, Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const; + bool IsFusedSupported(const std::vector<std::reference_wrapper<TensorInfo>>& inputs, + const std::vector<std::reference_wrapper<TensorInfo>>& outputs, + const FusedDescriptor& descriptor, + Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const; + bool IsGatherNdSupported(const TensorInfo& input0, const TensorInfo& input1, const TensorInfo& output, diff --git a/src/backends/neon/NeonWorkloadFactory.cpp b/src/backends/neon/NeonWorkloadFactory.cpp index e3411de254..4f131ac575 100644 --- a/src/backends/neon/NeonWorkloadFactory.cpp +++ b/src/backends/neon/NeonWorkloadFactory.cpp @@ -400,6 +400,11 @@ std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateWorkload(LayerType type, info, m_MemoryManager->GetIntraLayerManager()); } + case LayerType::Fused : + { + auto fusedQueueDescriptor = PolymorphicDowncast<const FusedQueueDescriptor*>(&descriptor); + return std::make_unique<NeonFusedWorkload>(*fusedQueueDescriptor, info); + } case LayerType::Gather : { auto gatherQueueDescriptor = PolymorphicDowncast<const GatherQueueDescriptor*>(&descriptor); diff --git a/src/backends/neon/backend.mk b/src/backends/neon/backend.mk index 2c91d1491d..3961ed1e34 100644 --- a/src/backends/neon/backend.mk +++ b/src/backends/neon/backend.mk @@ -48,6 +48,7 @@ BACKEND_SOURCES := \ workloads/NeonFillWorkload.cpp \ workloads/NeonFloorFloatWorkload.cpp \ workloads/NeonFullyConnectedWorkload.cpp \ + workloads/NeonFusedWorkload.cpp \ workloads/NeonGatherWorkload.cpp \ workloads/NeonGatherNdWorkload.cpp \ workloads/NeonInstanceNormalizationWorkload.cpp \ diff --git a/src/backends/neon/test/NeonLayerTests.cpp b/src/backends/neon/test/NeonLayerTests.cpp index 588c90be6d..c9dd1ff507 100644 --- a/src/backends/neon/test/NeonLayerTests.cpp +++ b/src/backends/neon/test/NeonLayerTests.cpp @@ -1724,6 +1724,13 @@ ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleConvertFp16ToFp32, SimpleConvertFp16ToFp32Te // Convert from Float32 to Float16 ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleConvertFp32ToFp16, SimpleConvertFp32ToFp16Test) +// AddMulAdd +ARMNN_AUTO_TEST_CASE_WITH_THF(AddMulAdd2OutputsFloat32, AddMulAddTest<DataType::Float32>, true) +ARMNN_AUTO_TEST_CASE_WITH_THF(AddMulAdd2OutputsUint8, AddMulAddTest<DataType::QAsymmU8>, true) + +ARMNN_AUTO_TEST_CASE_WITH_THF(AddMulAdd1OutputFloat32, AddMulAddTest<DataType::Float32>, false) +ARMNN_AUTO_TEST_CASE_WITH_THF(AddMulAdd1OutputUint8, AddMulAddTest<DataType::QAsymmU8>, false) + #if defined(ARMNNREF_ENABLED) // The ARMNN_COMPARE_REF_AUTO_TEST_CASE and the ARMNN_COMPARE_REF_FIXTURE_TEST_CASE test units are not available diff --git a/src/backends/neon/workloads/CMakeLists.txt b/src/backends/neon/workloads/CMakeLists.txt index 2cb2ccf385..f4438e4baa 100644 --- a/src/backends/neon/workloads/CMakeLists.txt +++ b/src/backends/neon/workloads/CMakeLists.txt @@ -54,6 +54,8 @@ list(APPEND armnnNeonBackendWorkloads_sources NeonFloorFloatWorkload.hpp NeonFullyConnectedWorkload.cpp NeonFullyConnectedWorkload.hpp + NeonFusedWorkload.cpp + NeonFusedWorkload.hpp NeonGatherWorkload.cpp NeonGatherWorkload.hpp NeonGatherNdWorkload.cpp diff --git a/src/backends/neon/workloads/NeonFusedWorkload.cpp b/src/backends/neon/workloads/NeonFusedWorkload.cpp new file mode 100644 index 0000000000..f770f46c81 --- /dev/null +++ b/src/backends/neon/workloads/NeonFusedWorkload.cpp @@ -0,0 +1,115 @@ +// +// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include "NeonFusedWorkload.hpp" +#include "NeonWorkloadUtils.hpp" + +#include <aclCommon/ArmComputeTensorUtils.hpp> +#include <aclCommon/ArmComputeUtils.hpp> + +#include <armnn/utility/PolymorphicDowncast.hpp> +#include <armnn/backends/TensorHandle.hpp> + +#include <arm_compute/runtime/NEON/functions/NEAddMulAdd.h> + +namespace armnn +{ + +using namespace armcomputetensorutils; + +arm_compute::Status NeonFusedWorkloadValidate(const std::vector<std::reference_wrapper<TensorInfo>>& inputInfos, + const std::vector<std::reference_wrapper<TensorInfo>>& outputInfos, + const FusedDescriptor& fusedDescriptor, + const ActivationDescriptor* activationDescriptor) +{ + std::vector<arm_compute::TensorInfo> actInputInfos; + actInputInfos.reserve(inputInfos.size()); + for (size_t i = 0u; i < inputInfos.size(); ++i) + { + actInputInfos.emplace_back(BuildArmComputeTensorInfo(inputInfos[i])); + } + + std::vector<arm_compute::TensorInfo> actOutputInfos; + actOutputInfos.reserve(outputInfos.size()); + for (size_t i = 0u; i < outputInfos.size(); ++i) + { + actOutputInfos.emplace_back(BuildArmComputeTensorInfo(outputInfos[i])); + } + + const arm_compute::ActivationLayerInfo activationInfo = + ConvertActivationDescriptorToAclActivationLayerInfo(activationDescriptor); + + switch (fusedDescriptor.m_FusedKernelType) + { + case FusedKernelType::AddMulAdd: + return arm_compute::NEAddMulAdd::validate( + &actInputInfos[0], + &actInputInfos[1], + &actInputInfos[2], // bn_mul + &actInputInfos[3], // bn_add + actOutputInfos.size() == 1 ? nullptr : &actOutputInfos[0], // add_output + actOutputInfos.size() == 1 ? &actOutputInfos[0] : &actOutputInfos[1], // final_output + arm_compute::ConvertPolicy::SATURATE, + activationInfo); + default: + return arm_compute::Status{arm_compute::ErrorCode::RUNTIME_ERROR, + "NeonFusedWorkloadValidate: no valid kernel type"}; + } +} + + +NeonFusedWorkload::NeonFusedWorkload(const FusedQueueDescriptor& descriptor, const WorkloadInfo& info) + : NeonBaseWorkload<FusedQueueDescriptor>(descriptor, info) +{ + m_Data.ValidateInputsOutputs("NeonFusedWorkload", + static_cast<unsigned int>(info.m_InputTensorInfos.size()), + static_cast<unsigned int>(info.m_OutputTensorInfos.size())); + + std::vector<arm_compute::ITensor*> inputs; + inputs.reserve(info.m_InputTensorInfos.size()); + for (auto input : m_Data.m_Inputs) + { + inputs.emplace_back(&PolymorphicDowncast<IAclTensorHandle*>(input)->GetTensor()); + } + + std::vector<arm_compute::ITensor*> outputs; + outputs.reserve(info.m_OutputTensorInfos.size()); + for (auto output : m_Data.m_Outputs) + { + outputs.emplace_back(&PolymorphicDowncast<IAclTensorHandle*>(output)->GetTensor()); + } + + const arm_compute::ActivationLayerInfo activationInfo = + ConvertAdditionalInfoToAclActivationLayerInfo(descriptor); + + switch (descriptor.m_Parameters.m_FusedKernelType) + { + case FusedKernelType::AddMulAdd: + { + auto layer = std::make_unique<arm_compute::NEAddMulAdd>(); + layer->configure(inputs[0], + inputs[1], + inputs[2], // bn_mul + inputs[3], // bn_add + outputs.size() == 1 ? nullptr : outputs[0], // add_output + outputs.size() == 1 ? outputs[0] : outputs[1], // final_output + arm_compute::ConvertPolicy::SATURATE, + activationInfo); + m_FusedLayer.reset(layer.release()); + break; + } + default: + throw Exception("NeonFusedWorkload: no valid kernel type."); + } +} + +void NeonFusedWorkload::Execute() const +{ + ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonFusedWorkload_Execute", this->GetGuid()); + m_FusedLayer->run(); +} + +} //namespace armnn + diff --git a/src/backends/neon/workloads/NeonFusedWorkload.hpp b/src/backends/neon/workloads/NeonFusedWorkload.hpp new file mode 100644 index 0000000000..aaabf61560 --- /dev/null +++ b/src/backends/neon/workloads/NeonFusedWorkload.hpp @@ -0,0 +1,35 @@ +// +// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#pragma once + +#include "NeonBaseWorkload.hpp" + +#include <arm_compute/core/Error.h> +#include <arm_compute/core/Types.h> +#include <arm_compute/runtime/IFunction.h> + +namespace armnn +{ + +arm_compute::Status NeonFusedWorkloadValidate(const std::vector<std::reference_wrapper<TensorInfo>>& inputInfos, + const std::vector<std::reference_wrapper<TensorInfo>>& outputInfos, + const FusedDescriptor& fusedDescriptor, + const ActivationDescriptor* activationDescriptor = nullptr); + +class NeonFusedWorkload : public NeonBaseWorkload<FusedQueueDescriptor> +{ +public: + NeonFusedWorkload(const FusedQueueDescriptor& descriptor, const WorkloadInfo& info); + virtual void Execute() const override; + +private: + std::unique_ptr<arm_compute::IFunction> m_FusedLayer; +}; + +} //namespace armnn + + + diff --git a/src/backends/neon/workloads/NeonWorkloads.hpp b/src/backends/neon/workloads/NeonWorkloads.hpp index b72f3bb703..615e5d87c8 100644 --- a/src/backends/neon/workloads/NeonWorkloads.hpp +++ b/src/backends/neon/workloads/NeonWorkloads.hpp @@ -30,6 +30,7 @@ #include "NeonFillWorkload.hpp" #include "NeonFloorFloatWorkload.hpp" #include "NeonFullyConnectedWorkload.hpp" +#include "NeonFusedWorkload.hpp" #include "NeonGatherWorkload.hpp" #include "NeonGatherNdWorkload.hpp" #include "NeonInstanceNormalizationWorkload.hpp" diff --git a/src/backends/reference/RefLayerSupport.cpp b/src/backends/reference/RefLayerSupport.cpp index 6323db1c03..0b1b9c7824 100644 --- a/src/backends/reference/RefLayerSupport.cpp +++ b/src/backends/reference/RefLayerSupport.cpp @@ -539,8 +539,8 @@ bool RefLayerSupport::IsLayerSupported(const LayerType& type, quantizedLstmInputParamsInfo.value(), reasonIfUnsupported); default: - // layers not supported in neon by default: - // precompiled, standin, switch + // layers not supported in reference by default: + // precompiled, standin, switch, fused return false; } } |