diff options
-rw-r--r-- | delegate/src/test/GatherNdTest.cpp | 24 | ||||
-rw-r--r-- | docs/02_operator_list.dox | 16 | ||||
-rw-r--r-- | src/backends/neon/NeonLayerSupport.cpp | 18 | ||||
-rw-r--r-- | src/backends/neon/NeonLayerSupport.hpp | 5 | ||||
-rw-r--r-- | src/backends/neon/NeonWorkloadFactory.cpp | 5 | ||||
-rw-r--r-- | src/backends/neon/backend.mk | 1 | ||||
-rw-r--r-- | src/backends/neon/test/NeonLayerTests.cpp | 11 | ||||
-rw-r--r-- | src/backends/neon/workloads/CMakeLists.txt | 2 | ||||
-rw-r--r-- | src/backends/neon/workloads/NeonGatherNdWorkload.cpp | 147 | ||||
-rw-r--r-- | src/backends/neon/workloads/NeonGatherNdWorkload.hpp | 41 | ||||
-rw-r--r-- | src/backends/neon/workloads/NeonWorkloads.hpp | 1 |
11 files changed, 254 insertions, 17 deletions
diff --git a/delegate/src/test/GatherNdTest.cpp b/delegate/src/test/GatherNdTest.cpp index b56a931d27..2b4fd4207e 100644 --- a/delegate/src/test/GatherNdTest.cpp +++ b/delegate/src/test/GatherNdTest.cpp @@ -19,13 +19,13 @@ namespace armnnDelegate void GatherNdUint8Test(std::vector<armnn::BackendId>& backends) { - std::vector<int32_t> paramsShape{8}; - std::vector<int32_t> indicesShape{3,1}; - std::vector<int32_t> expectedOutputShape{3}; + std::vector<int32_t> paramsShape{ 5, 2 }; + std::vector<int32_t> indicesShape{ 3, 1 }; + std::vector<int32_t> expectedOutputShape{ 3, 2 }; - std::vector<uint8_t> paramsValues{1, 2, 3, 4, 5, 6, 7, 8}; - std::vector<int32_t> indicesValues{7, 6, 5}; - std::vector<uint8_t> expectedOutputValues{8, 7, 6}; + std::vector<uint8_t> paramsValues{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }; + std::vector<int32_t> indicesValues{ 1, 0, 4 }; + std::vector<uint8_t> expectedOutputValues{ 3, 4, 1, 2, 9, 10 }; GatherNdTest<uint8_t>(::tflite::TensorType_UINT8, backends, @@ -39,13 +39,13 @@ void GatherNdUint8Test(std::vector<armnn::BackendId>& backends) void GatherNdFp32Test(std::vector<armnn::BackendId>& backends) { - std::vector<int32_t> paramsShape{8}; - std::vector<int32_t> indicesShape{3,1}; - std::vector<int32_t> expectedOutputShape{3}; + std::vector<int32_t> paramsShape{ 5, 2 }; + std::vector<int32_t> indicesShape{ 3, 1 }; + std::vector<int32_t> expectedOutputShape{ 3, 2 }; - std::vector<float> paramsValues{1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f}; - std::vector<int32_t> indicesValues{7, 6, 5}; - std::vector<float> expectedOutputValues{8.8f, 7.7f, 6.6f}; + std::vector<float> paramsValues{ 1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f, 9.9f, 10.10f }; + std::vector<int32_t> indicesValues{ 1, 0, 4 }; + std::vector<float> expectedOutputValues{ 3.3f, 4.4f, 1.1f, 2.2f, 9.9f, 10.10f }; GatherNdTest<float>(::tflite::TensorType_FLOAT32, backends, diff --git a/docs/02_operator_list.dox b/docs/02_operator_list.dox index b29d56f6b5..a02b4da2ff 100644 --- a/docs/02_operator_list.dox +++ b/docs/02_operator_list.dox @@ -1481,13 +1481,19 @@ where N = batches, C = channels, H = height, W = width <td>CpuAcc <td> <ul> - <li>TBD + <li>All </ul> <td> - <table> - <tr><th> - <tr><td>TBD - </table> + <table> + <tr><th> + <tr><td>BFLOAT16 + <tr><td>FLOAT16 + <tr><td>FLOAT32 + <tr><td>QASYMMS8 + <tr><td>QASYMMU8 + <tr><td>QSYMMS16 + <tr><td>SIGNED32 + </table> <tr> <td>GpuAcc <td> diff --git a/src/backends/neon/NeonLayerSupport.cpp b/src/backends/neon/NeonLayerSupport.cpp index 210535536e..26b650b49d 100644 --- a/src/backends/neon/NeonLayerSupport.cpp +++ b/src/backends/neon/NeonLayerSupport.cpp @@ -54,6 +54,7 @@ #include "workloads/NeonNormalizationFloatWorkload.hpp" #include "workloads/NeonFullyConnectedWorkload.hpp" #include "workloads/NeonGatherWorkload.hpp" +#include "workloads/NeonGatherNdWorkload.hpp" #include "workloads/NeonPadWorkload.hpp" #include "workloads/NeonPermuteWorkload.hpp" #include "workloads/NeonPooling2dWorkload.hpp" @@ -349,6 +350,11 @@ bool NeonLayerSupport::IsLayerSupported(const LayerType& type, infos[2], *(PolymorphicDowncast<const GatherDescriptor*>(&descriptor)), reasonIfUnsupported); + case LayerType::GatherNd: + return IsGatherNdSupported(infos[0], + infos[1], + infos[2], + reasonIfUnsupported); case LayerType::Input: return IsInputSupported(infos[0], reasonIfUnsupported); case LayerType::InstanceNormalization: @@ -998,6 +1004,18 @@ bool NeonLayerSupport::IsGatherSupported(const TensorInfo& input0, descriptor); } +bool NeonLayerSupport::IsGatherNdSupported(const TensorInfo& input0, + const TensorInfo& input1, + const TensorInfo& output, + Optional<std::string&> reasonIfUnsupported) const +{ + FORWARD_WORKLOAD_VALIDATE_FUNC(NeonGatherNdWorkloadValidate, + reasonIfUnsupported, + input0, + input1, + output); +} + bool NeonLayerSupport::IsInputSupported(const TensorInfo& input, Optional<std::string&> reasonIfUnsupported) const { diff --git a/src/backends/neon/NeonLayerSupport.hpp b/src/backends/neon/NeonLayerSupport.hpp index 511bb035d2..b82351872b 100644 --- a/src/backends/neon/NeonLayerSupport.hpp +++ b/src/backends/neon/NeonLayerSupport.hpp @@ -153,6 +153,11 @@ public: const FullyConnectedDescriptor& descriptor, Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override; + bool IsGatherNdSupported(const TensorInfo& input0, + const TensorInfo& input1, + const TensorInfo& output, + Optional<std::string&> reasonIfUnsupported) const; + bool IsGatherSupported(const TensorInfo& input0, const TensorInfo& input1, const TensorInfo& output, diff --git a/src/backends/neon/NeonWorkloadFactory.cpp b/src/backends/neon/NeonWorkloadFactory.cpp index c83e8b3e6d..cbed690733 100644 --- a/src/backends/neon/NeonWorkloadFactory.cpp +++ b/src/backends/neon/NeonWorkloadFactory.cpp @@ -346,6 +346,11 @@ std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateWorkload(LayerType type, auto gatherQueueDescriptor = PolymorphicDowncast<const GatherQueueDescriptor*>(&descriptor); return std::make_unique<NeonGatherWorkload>(*gatherQueueDescriptor, info); } + case LayerType::GatherNd : + { + auto gatherNdQueueDescriptor = PolymorphicDowncast<const GatherNdQueueDescriptor*>(&descriptor); + return std::make_unique<NeonGatherNdWorkload>(*gatherNdQueueDescriptor, info); + } case LayerType::Input : { auto inputQueueDescriptor = PolymorphicDowncast<const InputQueueDescriptor*>(&descriptor); diff --git a/src/backends/neon/backend.mk b/src/backends/neon/backend.mk index 0d6fd8035c..c522df6bad 100644 --- a/src/backends/neon/backend.mk +++ b/src/backends/neon/backend.mk @@ -48,6 +48,7 @@ BACKEND_SOURCES := \ workloads/NeonFloorFloatWorkload.cpp \ workloads/NeonFullyConnectedWorkload.cpp \ workloads/NeonGatherWorkload.cpp \ + workloads/NeonGatherNdWorkload.cpp \ workloads/NeonInstanceNormalizationWorkload.cpp \ workloads/NeonL2NormalizationFloatWorkload.cpp \ workloads/NeonLogWorkload.cpp \ diff --git a/src/backends/neon/test/NeonLayerTests.cpp b/src/backends/neon/test/NeonLayerTests.cpp index 1ac6c78ec7..e0811022f9 100644 --- a/src/backends/neon/test/NeonLayerTests.cpp +++ b/src/backends/neon/test/NeonLayerTests.cpp @@ -781,6 +781,17 @@ ARMNN_AUTO_TEST_CASE_WITH_THF(Gather1dParamsUint8, Gather1dParamsUint8Test) ARMNN_AUTO_TEST_CASE_WITH_THF(GatherMultiDimParamsFloat32, GatherMultiDimParamsFloat32Test) ARMNN_AUTO_TEST_CASE_WITH_THF(GatherMultiDimParamsUint8, GatherMultiDimParamsUint8Test) +// GatherNd +ARMNN_AUTO_TEST_CASE_WITH_THF(GatherNd2dFloat32, SimpleGatherNd2dTest<DataType::Float32>) +ARMNN_AUTO_TEST_CASE_WITH_THF(GatherNd3dFloat32, SimpleGatherNd3dTest<DataType::Float32>) +ARMNN_AUTO_TEST_CASE_WITH_THF(GatherNd4dFloat32, SimpleGatherNd4dTest<DataType::Float32>) +ARMNN_AUTO_TEST_CASE_WITH_THF(GatherNd2dInt8, SimpleGatherNd2dTest<DataType::QAsymmS8>) +ARMNN_AUTO_TEST_CASE_WITH_THF(GatherNd3dInt8, SimpleGatherNd3dTest<DataType::QAsymmS8>) +ARMNN_AUTO_TEST_CASE_WITH_THF(GatherNd4dInt8, SimpleGatherNd4dTest<DataType::QAsymmS8>) +ARMNN_AUTO_TEST_CASE_WITH_THF(GatherNd2dInt32, SimpleGatherNd2dTest<DataType::Signed32>) +ARMNN_AUTO_TEST_CASE_WITH_THF(GatherNd3dInt32, SimpleGatherNd3dTest<DataType::Signed32>) +ARMNN_AUTO_TEST_CASE_WITH_THF(GatherNd4dInt32, SimpleGatherNd4dTest<DataType::Signed32>) + // Equal ARMNN_AUTO_TEST_CASE_WITH_THF(EqualSimple, EqualSimpleTest) ARMNN_AUTO_TEST_CASE_WITH_THF(EqualBroadcast1Element, EqualBroadcast1ElementTest) diff --git a/src/backends/neon/workloads/CMakeLists.txt b/src/backends/neon/workloads/CMakeLists.txt index 33a18e38da..8953dc9d4d 100644 --- a/src/backends/neon/workloads/CMakeLists.txt +++ b/src/backends/neon/workloads/CMakeLists.txt @@ -56,6 +56,8 @@ list(APPEND armnnNeonBackendWorkloads_sources NeonFullyConnectedWorkload.hpp NeonGatherWorkload.cpp NeonGatherWorkload.hpp + NeonGatherNdWorkload.cpp + NeonGatherNdWorkload.hpp NeonInstanceNormalizationWorkload.cpp NeonInstanceNormalizationWorkload.hpp NeonL2NormalizationFloatWorkload.cpp diff --git a/src/backends/neon/workloads/NeonGatherNdWorkload.cpp b/src/backends/neon/workloads/NeonGatherNdWorkload.cpp new file mode 100644 index 0000000000..00c66cf9be --- /dev/null +++ b/src/backends/neon/workloads/NeonGatherNdWorkload.cpp @@ -0,0 +1,147 @@ +// +// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include "NeonGatherNdWorkload.hpp" +#include "NeonWorkloadUtils.hpp" +#include <armnn/utility/PolymorphicDowncast.hpp> +#include <aclCommon/ArmComputeUtils.hpp> +#include "backendsCommon/WorkloadUtils.hpp" + +namespace armnn +{ +arm_compute::Status NeonGatherNdWorkloadValidate(const TensorInfo& paramInfo, + const TensorInfo& indicesInfo, + const TensorInfo& outputInfo) +{ + // Calculate ND, K, W, C. + std::map<std::string, unsigned int> keyIndices = CalculateGatherNdKeyIndices(paramInfo, indicesInfo); + + /// Call Gather with adequate shapes + // Reshape params into { K, C } + armnn::TensorInfo params_K_C_Info = paramInfo; + params_K_C_Info.SetShape({ keyIndices["K"], keyIndices["C"] }); + + // Reshape indices into { W } + armnn::TensorInfo indices_W_Info = indicesInfo; + indices_W_Info.SetShape({ keyIndices["W"] }); + + // Reshape output to have the shape given by gather { W, C } + // (the original outputInfo has the shape given by gatherNd) + armnn::TensorInfo outputGather_Info = outputInfo; + outputGather_Info.SetShape({ keyIndices["W"], keyIndices["C"] }); + + const arm_compute::TensorInfo aclParamsInfo = BuildArmComputeTensorInfo(params_K_C_Info); + const arm_compute::TensorInfo aclIndicesInfo = BuildArmComputeTensorInfo(indices_W_Info); + const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(outputGather_Info); + + auto aclAxis = ComputeAclAxis(0, params_K_C_Info); + return arm_compute::NEGather::validate(&aclParamsInfo, &aclIndicesInfo, &aclOutputInfo, aclAxis); +} + +NeonGatherNdWorkload::NeonGatherNdWorkload(const GatherNdQueueDescriptor& descriptor, + const WorkloadInfo& info) + : NeonBaseWorkload<GatherNdQueueDescriptor>(descriptor, info) +{ + m_Data.ValidateInputsOutputs("NeonGatherNdWorkload", 2, 1); + + TensorInfo paramsInfo = info.m_InputTensorInfos[0]; + TensorInfo indicesInfo = info.m_InputTensorInfos[1]; + TensorInfo outputInfo = info.m_OutputTensorInfos[0]; + + arm_compute::ITensor& input = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor(); + arm_compute::ITensor& indices = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Inputs[1])->GetTensor(); + arm_compute::ITensor& output = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Outputs[0])->GetTensor(); + + // Calculate ND, K, W, C. + std::map<std::string, unsigned int> keyIndices = CalculateGatherNdKeyIndices(paramsInfo, indicesInfo); + + /// Calculate flattened indices: m_FlattenedIndices = indices * m_FlattenedCoeff. + /// This could be done using MatMul instead of multiplication followed by reduce sum operation, + /// but GeMM does not support s32 at the moment. + + // Prepare the tensor to store the output of the reduce_sum operation + armnn::TensorInfo flattenedIndices_Info = indicesInfo; + flattenedIndices_Info.SetShape({ keyIndices["W"] }); + BuildArmComputeTensor(m_FlattenedIndices, flattenedIndices_Info); + armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_FlattenedIndices); + + // Reshape indices into { W, ND } + indices.info()->set_tensor_shape(BuildArmComputeTensorShape({ keyIndices["W"], keyIndices["ND"] })); + + // Calculate the m_FlattenedCoeff + TensorShape paramsShape = paramsInfo.GetShape(); + std::vector<unsigned int> flattenedCoeff(keyIndices["ND"], 1); + for (unsigned int i = 1; i < keyIndices["ND"]; ++i) + { + flattenedCoeff[i - 1] = paramsShape[i]; + } + for (unsigned int i = keyIndices["ND"] - 1; i > 0; --i) + { + flattenedCoeff[i - 1] *= flattenedCoeff[i]; + } + armnn::TensorInfo flattenedCoeff_Info = indicesInfo; + flattenedCoeff_Info.SetShape({ keyIndices["ND"] }); + BuildArmComputeTensor(m_FlattenedCoeff, flattenedCoeff_Info); + armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_FlattenedCoeff); + CopyArmComputeITensorData(flattenedCoeff.data(), m_FlattenedCoeff); + + // Prepare the tensor to store the output of the multiplication + armnn::TensorInfo outputMul_Info = indicesInfo; + outputMul_Info.SetShape({ keyIndices["W"], keyIndices["ND"] }); + BuildArmComputeTensor(m_outputMul, outputMul_Info); + armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_outputMul); + + // Multiply + auto convertPolicy = (IsQuantizedType(info.m_InputTensorInfos[0].GetDataType()) || + IsQuantizedType(info.m_InputTensorInfos[1].GetDataType())) ? + arm_compute::ConvertPolicy::SATURATE : + arm_compute::ConvertPolicy::WRAP; + + m_MulLayer.configure(&indices, + &m_FlattenedCoeff, + &m_outputMul, + 1.0f, + convertPolicy, + arm_compute::RoundingPolicy::TO_ZERO, + arm_compute::ActivationLayerInfo()); + + // Reduce Sum + const std::vector<unsigned int> armnnReduceAxes(1, 1); + arm_compute::Coordinates coords = BuildArmComputeReductionCoordinates(m_outputMul.info()->num_dimensions(), + outputMul_Info.GetNumDimensions(), + armnnReduceAxes); + m_ReduceSumLayer.configure(&m_outputMul, + &m_FlattenedIndices, + static_cast<unsigned int>(coords[0]), + arm_compute::ReductionOperation::SUM, + false); + + /// Call Gather with adequate shapes + // Reshape params into { K, C } + paramsInfo.SetShape({ keyIndices["K"], keyIndices["C"] }); + input.info()->set_tensor_shape(BuildArmComputeTensorShape(paramsInfo.GetShape())); + + // Reshape output to have the shape given by gather { W, C } + // (the original outputInfo has the shape given by gatherNd) + armnn::TensorInfo outputGather_Info = outputInfo; + outputGather_Info.SetShape({ keyIndices["W"], keyIndices["C"] }); + BuildArmComputeTensor(m_outputGather, outputGather_Info); + armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_outputGather); + + m_GatherLayer.configure(&input, &m_FlattenedIndices, &m_outputGather, ComputeAclAxis(0, paramsInfo)); + + // Reshape output to the original output shape + m_ReshapeLayer.configure(&m_outputGather, &output); +} + +void NeonGatherNdWorkload::Execute() const +{ + ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonGatherNdWorkload_Execute", this->GetGuid()); + m_MulLayer.run(); + m_ReduceSumLayer.run(); + m_GatherLayer.run(); + m_ReshapeLayer.run(); +} +} //namespace armnn
\ No newline at end of file diff --git a/src/backends/neon/workloads/NeonGatherNdWorkload.hpp b/src/backends/neon/workloads/NeonGatherNdWorkload.hpp new file mode 100644 index 0000000000..848aac667b --- /dev/null +++ b/src/backends/neon/workloads/NeonGatherNdWorkload.hpp @@ -0,0 +1,41 @@ +// +// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#pragma once + +#include "NeonBaseWorkload.hpp" + +#include "arm_compute/runtime/Tensor.h" +#include "arm_compute/runtime/NEON/functions/NEGather.h" +#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" +#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" +#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" + +namespace armnn +{ +arm_compute::Status NeonGatherNdWorkloadValidate(const TensorInfo& input, + const TensorInfo& indices, + const TensorInfo& output); + +class NeonGatherNdWorkload : public NeonBaseWorkload<GatherNdQueueDescriptor> +{ +public: + NeonGatherNdWorkload(const GatherNdQueueDescriptor& descriptor, const WorkloadInfo& info); + virtual void Execute() const override; + +private: + arm_compute::Tensor m_FlattenedCoeff; + arm_compute::Tensor m_outputMul; + arm_compute::Tensor m_FlattenedIndices; + arm_compute::Tensor m_outputGather; + + mutable arm_compute::NEPixelWiseMultiplication m_MulLayer; + mutable arm_compute::NEReductionOperation m_ReduceSumLayer; + mutable arm_compute::NEGather m_GatherLayer; + mutable arm_compute::NEReshapeLayer m_ReshapeLayer; + +}; + +} //namespace armnn
\ No newline at end of file diff --git a/src/backends/neon/workloads/NeonWorkloads.hpp b/src/backends/neon/workloads/NeonWorkloads.hpp index 8b99f03a7f..024f1ca983 100644 --- a/src/backends/neon/workloads/NeonWorkloads.hpp +++ b/src/backends/neon/workloads/NeonWorkloads.hpp @@ -31,6 +31,7 @@ #include "NeonFloorFloatWorkload.hpp" #include "NeonFullyConnectedWorkload.hpp" #include "NeonGatherWorkload.hpp" +#include "NeonGatherNdWorkload.hpp" #include "NeonInstanceNormalizationWorkload.hpp" #include "NeonL2NormalizationFloatWorkload.hpp" #include "NeonLogWorkload.hpp" |