author    Teresa Charlin <teresa.charlinreyes@arm.com>  2022-04-26 18:14:12 +0100
committer TeresaARM <teresa.charlinreyes@arm.com>       2022-05-05 08:49:41 +0000
commit    bd22c7d8d71bb9d6fdebcd07a472d66c7616abad (patch)
tree      87a132055db845fab901e18fcb6edd6998e33f3f
parent    1299496996bc332f02218f926640a9255ed60310 (diff)
download  armnn-bd22c7d8d71bb9d6fdebcd07a472d66c7616abad.tar.gz
IVGCVSW-6862 Add GATHERNd Neon workload
* Change the delegate test to match one of the unit tests.

Signed-off-by: Teresa Charlin <teresa.charlinreyes@arm.com>
Change-Id: I553ca266116ba8ee173fc951ab1ffd2b6eed1428
-rw-r--r--  delegate/src/test/GatherNdTest.cpp                    |  24
-rw-r--r--  docs/02_operator_list.dox                             |  16
-rw-r--r--  src/backends/neon/NeonLayerSupport.cpp                |  18
-rw-r--r--  src/backends/neon/NeonLayerSupport.hpp                |   5
-rw-r--r--  src/backends/neon/NeonWorkloadFactory.cpp             |   5
-rw-r--r--  src/backends/neon/backend.mk                          |   1
-rw-r--r--  src/backends/neon/test/NeonLayerTests.cpp             |  11
-rw-r--r--  src/backends/neon/workloads/CMakeLists.txt            |   2
-rw-r--r--  src/backends/neon/workloads/NeonGatherNdWorkload.cpp  | 147
-rw-r--r--  src/backends/neon/workloads/NeonGatherNdWorkload.hpp  |  41
-rw-r--r--  src/backends/neon/workloads/NeonWorkloads.hpp         |   1
11 files changed, 254 insertions(+), 17 deletions(-)
diff --git a/delegate/src/test/GatherNdTest.cpp b/delegate/src/test/GatherNdTest.cpp
index b56a931d27..2b4fd4207e 100644
--- a/delegate/src/test/GatherNdTest.cpp
+++ b/delegate/src/test/GatherNdTest.cpp
@@ -19,13 +19,13 @@ namespace armnnDelegate
void GatherNdUint8Test(std::vector<armnn::BackendId>& backends)
{
- std::vector<int32_t> paramsShape{8};
- std::vector<int32_t> indicesShape{3,1};
- std::vector<int32_t> expectedOutputShape{3};
+ std::vector<int32_t> paramsShape{ 5, 2 };
+ std::vector<int32_t> indicesShape{ 3, 1 };
+ std::vector<int32_t> expectedOutputShape{ 3, 2 };
- std::vector<uint8_t> paramsValues{1, 2, 3, 4, 5, 6, 7, 8};
- std::vector<int32_t> indicesValues{7, 6, 5};
- std::vector<uint8_t> expectedOutputValues{8, 7, 6};
+ std::vector<uint8_t> paramsValues{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
+ std::vector<int32_t> indicesValues{ 1, 0, 4 };
+ std::vector<uint8_t> expectedOutputValues{ 3, 4, 1, 2, 9, 10 };
GatherNdTest<uint8_t>(::tflite::TensorType_UINT8,
backends,
@@ -39,13 +39,13 @@ void GatherNdUint8Test(std::vector<armnn::BackendId>& backends)
void GatherNdFp32Test(std::vector<armnn::BackendId>& backends)
{
- std::vector<int32_t> paramsShape{8};
- std::vector<int32_t> indicesShape{3,1};
- std::vector<int32_t> expectedOutputShape{3};
+ std::vector<int32_t> paramsShape{ 5, 2 };
+ std::vector<int32_t> indicesShape{ 3, 1 };
+ std::vector<int32_t> expectedOutputShape{ 3, 2 };
- std::vector<float> paramsValues{1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f};
- std::vector<int32_t> indicesValues{7, 6, 5};
- std::vector<float> expectedOutputValues{8.8f, 7.7f, 6.6f};
+ std::vector<float> paramsValues{ 1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f, 9.9f, 10.10f };
+ std::vector<int32_t> indicesValues{ 1, 0, 4 };
+ std::vector<float> expectedOutputValues{ 3.3f, 4.4f, 1.1f, 2.2f, 9.9f, 10.10f };
GatherNdTest<float>(::tflite::TensorType_FLOAT32,
backends,
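For reference, the new test values follow standard GatherNd semantics: each row of the indices tensor selects one complete slice (here, one row) of the params tensor. A minimal standalone C++ sketch, independent of the delegate test harness, that reproduces the expected uint8 output:

#include <cassert>
#include <cstdint>
#include <vector>

int main()
{
    // params shape { 5, 2 }, stored row-major.
    std::vector<uint8_t> params{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
    // indices shape { 3, 1 }: each entry picks one row of params.
    std::vector<int32_t> indices{ 1, 0, 4 };
    const size_t rowSize = 2;

    std::vector<uint8_t> output;
    for (int32_t idx : indices)
    {
        for (size_t c = 0; c < rowSize; ++c)
        {
            output.push_back(params[static_cast<size_t>(idx) * rowSize + c]);
        }
    }
    // Expected output shape { 3, 2 }: rows 1, 0 and 4 of params.
    assert((output == std::vector<uint8_t>{ 3, 4, 1, 2, 9, 10 }));
    return 0;
}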
diff --git a/docs/02_operator_list.dox b/docs/02_operator_list.dox
index b29d56f6b5..a02b4da2ff 100644
--- a/docs/02_operator_list.dox
+++ b/docs/02_operator_list.dox
@@ -1481,13 +1481,19 @@ where N = batches, C = channels, H = height, W = width
<td>CpuAcc
<td>
<ul>
- <li>TBD
+ <li>All
</ul>
<td>
- <table>
- <tr><th>
- <tr><td>TBD
- </table>
+ <table>
+ <tr><th>
+ <tr><td>BFLOAT16
+ <tr><td>FLOAT16
+ <tr><td>FLOAT32
+ <tr><td>QASYMMS8
+ <tr><td>QASYMMU8
+ <tr><td>QSYMMS16
+ <tr><td>SIGNED32
+ </table>
<tr>
<td>GpuAcc
<td>
diff --git a/src/backends/neon/NeonLayerSupport.cpp b/src/backends/neon/NeonLayerSupport.cpp
index 210535536e..26b650b49d 100644
--- a/src/backends/neon/NeonLayerSupport.cpp
+++ b/src/backends/neon/NeonLayerSupport.cpp
@@ -54,6 +54,7 @@
#include "workloads/NeonNormalizationFloatWorkload.hpp"
#include "workloads/NeonFullyConnectedWorkload.hpp"
#include "workloads/NeonGatherWorkload.hpp"
+#include "workloads/NeonGatherNdWorkload.hpp"
#include "workloads/NeonPadWorkload.hpp"
#include "workloads/NeonPermuteWorkload.hpp"
#include "workloads/NeonPooling2dWorkload.hpp"
@@ -349,6 +350,11 @@ bool NeonLayerSupport::IsLayerSupported(const LayerType& type,
infos[2],
*(PolymorphicDowncast<const GatherDescriptor*>(&descriptor)),
reasonIfUnsupported);
+ case LayerType::GatherNd:
+ return IsGatherNdSupported(infos[0],
+ infos[1],
+ infos[2],
+ reasonIfUnsupported);
case LayerType::Input:
return IsInputSupported(infos[0], reasonIfUnsupported);
case LayerType::InstanceNormalization:
@@ -998,6 +1004,18 @@ bool NeonLayerSupport::IsGatherSupported(const TensorInfo& input0,
descriptor);
}
+bool NeonLayerSupport::IsGatherNdSupported(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output,
+ Optional<std::string&> reasonIfUnsupported) const
+{
+ FORWARD_WORKLOAD_VALIDATE_FUNC(NeonGatherNdWorkloadValidate,
+ reasonIfUnsupported,
+ input0,
+ input1,
+ output);
+}
+
bool NeonLayerSupport::IsInputSupported(const TensorInfo& input,
Optional<std::string&> reasonIfUnsupported) const
{
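IsGatherNdSupported follows the established backend pattern: FORWARD_WORKLOAD_VALIDATE_FUNC forwards the query to NeonGatherNdWorkloadValidate and folds the resulting arm_compute::Status into a bool, surfacing the error text through reasonIfUnsupported. A hand-expanded sketch of that pattern (illustrative only, not the macro's literal definition, which also checks that the Neon backend is compiled in):

// Conceptual expansion inside IsGatherNdSupported:
arm_compute::Status status = NeonGatherNdWorkloadValidate(input0, input1, output);
const bool supported = (status.error_code() == arm_compute::ErrorCode::OK);
if (!supported && reasonIfUnsupported.has_value())
{
    reasonIfUnsupported.value() = status.error_description();
}
return supported;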
diff --git a/src/backends/neon/NeonLayerSupport.hpp b/src/backends/neon/NeonLayerSupport.hpp
index 511bb035d2..b82351872b 100644
--- a/src/backends/neon/NeonLayerSupport.hpp
+++ b/src/backends/neon/NeonLayerSupport.hpp
@@ -153,6 +153,11 @@ public:
const FullyConnectedDescriptor& descriptor,
Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+ bool IsGatherNdSupported(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output,
+ Optional<std::string&> reasonIfUnsupported) const;
+
bool IsGatherSupported(const TensorInfo& input0,
const TensorInfo& input1,
const TensorInfo& output,
diff --git a/src/backends/neon/NeonWorkloadFactory.cpp b/src/backends/neon/NeonWorkloadFactory.cpp
index c83e8b3e6d..cbed690733 100644
--- a/src/backends/neon/NeonWorkloadFactory.cpp
+++ b/src/backends/neon/NeonWorkloadFactory.cpp
@@ -346,6 +346,11 @@ std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateWorkload(LayerType type,
auto gatherQueueDescriptor = PolymorphicDowncast<const GatherQueueDescriptor*>(&descriptor);
return std::make_unique<NeonGatherWorkload>(*gatherQueueDescriptor, info);
}
+ case LayerType::GatherNd :
+ {
+ auto gatherNdQueueDescriptor = PolymorphicDowncast<const GatherNdQueueDescriptor*>(&descriptor);
+ return std::make_unique<NeonGatherNdWorkload>(*gatherNdQueueDescriptor, info);
+ }
case LayerType::Input :
{
auto inputQueueDescriptor = PolymorphicDowncast<const InputQueueDescriptor*>(&descriptor);
diff --git a/src/backends/neon/backend.mk b/src/backends/neon/backend.mk
index 0d6fd8035c..c522df6bad 100644
--- a/src/backends/neon/backend.mk
+++ b/src/backends/neon/backend.mk
@@ -48,6 +48,7 @@ BACKEND_SOURCES := \
workloads/NeonFloorFloatWorkload.cpp \
workloads/NeonFullyConnectedWorkload.cpp \
workloads/NeonGatherWorkload.cpp \
+ workloads/NeonGatherNdWorkload.cpp \
workloads/NeonInstanceNormalizationWorkload.cpp \
workloads/NeonL2NormalizationFloatWorkload.cpp \
workloads/NeonLogWorkload.cpp \
diff --git a/src/backends/neon/test/NeonLayerTests.cpp b/src/backends/neon/test/NeonLayerTests.cpp
index 1ac6c78ec7..e0811022f9 100644
--- a/src/backends/neon/test/NeonLayerTests.cpp
+++ b/src/backends/neon/test/NeonLayerTests.cpp
@@ -781,6 +781,17 @@ ARMNN_AUTO_TEST_CASE_WITH_THF(Gather1dParamsUint8, Gather1dParamsUint8Test)
ARMNN_AUTO_TEST_CASE_WITH_THF(GatherMultiDimParamsFloat32, GatherMultiDimParamsFloat32Test)
ARMNN_AUTO_TEST_CASE_WITH_THF(GatherMultiDimParamsUint8, GatherMultiDimParamsUint8Test)
+// GatherNd
+ARMNN_AUTO_TEST_CASE_WITH_THF(GatherNd2dFloat32, SimpleGatherNd2dTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(GatherNd3dFloat32, SimpleGatherNd3dTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(GatherNd4dFloat32, SimpleGatherNd4dTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(GatherNd2dInt8, SimpleGatherNd2dTest<DataType::QAsymmS8>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(GatherNd3dInt8, SimpleGatherNd3dTest<DataType::QAsymmS8>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(GatherNd4dInt8, SimpleGatherNd4dTest<DataType::QAsymmS8>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(GatherNd2dInt32, SimpleGatherNd2dTest<DataType::Signed32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(GatherNd3dInt32, SimpleGatherNd3dTest<DataType::Signed32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(GatherNd4dInt32, SimpleGatherNd4dTest<DataType::Signed32>)
+
// Equal
ARMNN_AUTO_TEST_CASE_WITH_THF(EqualSimple, EqualSimpleTest)
ARMNN_AUTO_TEST_CASE_WITH_THF(EqualBroadcast1Element, EqualBroadcast1ElementTest)
diff --git a/src/backends/neon/workloads/CMakeLists.txt b/src/backends/neon/workloads/CMakeLists.txt
index 33a18e38da..8953dc9d4d 100644
--- a/src/backends/neon/workloads/CMakeLists.txt
+++ b/src/backends/neon/workloads/CMakeLists.txt
@@ -56,6 +56,8 @@ list(APPEND armnnNeonBackendWorkloads_sources
NeonFullyConnectedWorkload.hpp
NeonGatherWorkload.cpp
NeonGatherWorkload.hpp
+ NeonGatherNdWorkload.cpp
+ NeonGatherNdWorkload.hpp
NeonInstanceNormalizationWorkload.cpp
NeonInstanceNormalizationWorkload.hpp
NeonL2NormalizationFloatWorkload.cpp
diff --git a/src/backends/neon/workloads/NeonGatherNdWorkload.cpp b/src/backends/neon/workloads/NeonGatherNdWorkload.cpp
new file mode 100644
index 0000000000..00c66cf9be
--- /dev/null
+++ b/src/backends/neon/workloads/NeonGatherNdWorkload.cpp
@@ -0,0 +1,147 @@
+//
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonGatherNdWorkload.hpp"
+#include "NeonWorkloadUtils.hpp"
+#include <armnn/utility/PolymorphicDowncast.hpp>
+#include <aclCommon/ArmComputeUtils.hpp>
+#include "backendsCommon/WorkloadUtils.hpp"
+
+namespace armnn
+{
+arm_compute::Status NeonGatherNdWorkloadValidate(const TensorInfo& paramInfo,
+ const TensorInfo& indicesInfo,
+ const TensorInfo& outputInfo)
+{
+ // Calculate ND, K, W, C.
+ std::map<std::string, unsigned int> keyIndices = CalculateGatherNdKeyIndices(paramInfo, indicesInfo);
+
+ /// Call Gather with adequate shapes
+ // Reshape params into { K, C }
+ armnn::TensorInfo params_K_C_Info = paramInfo;
+ params_K_C_Info.SetShape({ keyIndices["K"], keyIndices["C"] });
+
+ // Reshape indices into { W }
+ armnn::TensorInfo indices_W_Info = indicesInfo;
+ indices_W_Info.SetShape({ keyIndices["W"] });
+
+ // Reshape output to have the shape given by gather { W, C }
+ // (the original outputInfo has the shape given by gatherNd)
+ armnn::TensorInfo outputGather_Info = outputInfo;
+ outputGather_Info.SetShape({ keyIndices["W"], keyIndices["C"] });
+
+ const arm_compute::TensorInfo aclParamsInfo = BuildArmComputeTensorInfo(params_K_C_Info);
+ const arm_compute::TensorInfo aclIndicesInfo = BuildArmComputeTensorInfo(indices_W_Info);
+ const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(outputGather_Info);
+
+ auto aclAxis = ComputeAclAxis(0, params_K_C_Info);
+ return arm_compute::NEGather::validate(&aclParamsInfo, &aclIndicesInfo, &aclOutputInfo, aclAxis);
+}
+
+NeonGatherNdWorkload::NeonGatherNdWorkload(const GatherNdQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : NeonBaseWorkload<GatherNdQueueDescriptor>(descriptor, info)
+{
+ m_Data.ValidateInputsOutputs("NeonGatherNdWorkload", 2, 1);
+
+ TensorInfo paramsInfo = info.m_InputTensorInfos[0];
+ TensorInfo indicesInfo = info.m_InputTensorInfos[1];
+ TensorInfo outputInfo = info.m_OutputTensorInfos[0];
+
+ arm_compute::ITensor& input = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ITensor& indices = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
+ arm_compute::ITensor& output = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+ // Calculate ND, K, W, C.
+ std::map<std::string, unsigned int> keyIndices = CalculateGatherNdKeyIndices(paramsInfo, indicesInfo);
+
+ /// Calculate flattened indices: m_FlattenedIndices = indices * m_FlattenedCoeff.
+ /// This could be done using MatMul instead of multiplication followed by reduce sum operation,
+ /// but GeMM does not support s32 at the moment.
+
+ // Prepare the tensor to store the output of the reduce_sum operation
+ armnn::TensorInfo flattenedIndices_Info = indicesInfo;
+ flattenedIndices_Info.SetShape({ keyIndices["W"] });
+ BuildArmComputeTensor(m_FlattenedIndices, flattenedIndices_Info);
+ armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_FlattenedIndices);
+
+ // Reshape indices into { W, ND }
+ indices.info()->set_tensor_shape(BuildArmComputeTensorShape({ keyIndices["W"], keyIndices["ND"] }));
+
+ // Calculate the m_FlattenedCoeff
+ TensorShape paramsShape = paramsInfo.GetShape();
+ std::vector<unsigned int> flattenedCoeff(keyIndices["ND"], 1);
+ for (unsigned int i = 1; i < keyIndices["ND"]; ++i)
+ {
+ flattenedCoeff[i - 1] = paramsShape[i];
+ }
+ for (unsigned int i = keyIndices["ND"] - 1; i > 0; --i)
+ {
+ flattenedCoeff[i - 1] *= flattenedCoeff[i];
+ }
+ armnn::TensorInfo flattenedCoeff_Info = indicesInfo;
+ flattenedCoeff_Info.SetShape({ keyIndices["ND"] });
+ BuildArmComputeTensor(m_FlattenedCoeff, flattenedCoeff_Info);
+ armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_FlattenedCoeff);
+ CopyArmComputeITensorData(flattenedCoeff.data(), m_FlattenedCoeff);
+
+ // Prepare the tensor to store the output of the multiplication
+ armnn::TensorInfo outputMul_Info = indicesInfo;
+ outputMul_Info.SetShape({ keyIndices["W"], keyIndices["ND"] });
+ BuildArmComputeTensor(m_outputMul, outputMul_Info);
+ armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_outputMul);
+
+ // Multiply
+ auto convertPolicy = (IsQuantizedType(info.m_InputTensorInfos[0].GetDataType()) ||
+ IsQuantizedType(info.m_InputTensorInfos[1].GetDataType())) ?
+ arm_compute::ConvertPolicy::SATURATE :
+ arm_compute::ConvertPolicy::WRAP;
+
+ m_MulLayer.configure(&indices,
+ &m_FlattenedCoeff,
+ &m_outputMul,
+ 1.0f,
+ convertPolicy,
+ arm_compute::RoundingPolicy::TO_ZERO,
+ arm_compute::ActivationLayerInfo());
+
+ // Reduce Sum
+ const std::vector<unsigned int> armnnReduceAxes(1, 1);
+ arm_compute::Coordinates coords = BuildArmComputeReductionCoordinates(m_outputMul.info()->num_dimensions(),
+ outputMul_Info.GetNumDimensions(),
+ armnnReduceAxes);
+ m_ReduceSumLayer.configure(&m_outputMul,
+ &m_FlattenedIndices,
+ static_cast<unsigned int>(coords[0]),
+ arm_compute::ReductionOperation::SUM,
+ false);
+
+ /// Call Gather with adequate shapes
+ // Reshape params into { K, C }
+ paramsInfo.SetShape({ keyIndices["K"], keyIndices["C"] });
+ input.info()->set_tensor_shape(BuildArmComputeTensorShape(paramsInfo.GetShape()));
+
+ // Reshape output to have the shape given by gather { W, C }
+ // (the original outputInfo has the shape given by gatherNd)
+ armnn::TensorInfo outputGather_Info = outputInfo;
+ outputGather_Info.SetShape({ keyIndices["W"], keyIndices["C"] });
+ BuildArmComputeTensor(m_outputGather, outputGather_Info);
+ armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_outputGather);
+
+ m_GatherLayer.configure(&input, &m_FlattenedIndices, &m_outputGather, ComputeAclAxis(0, paramsInfo));
+
+ // Reshape output to the original output shape
+ m_ReshapeLayer.configure(&m_outputGather, &output);
+}
+
+void NeonGatherNdWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonGatherNdWorkload_Execute", this->GetGuid());
+ m_MulLayer.run();
+ m_ReduceSumLayer.run();
+ m_GatherLayer.run();
+ m_ReshapeLayer.run();
+}
+} //namespace armnn
\ No newline at end of file
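The coefficient computation in the constructor above builds the row-major strides of the leading ND dimensions of params, so the dot product of each ND-element index row with m_FlattenedCoeff yields a flat row index into the reshaped { K, C } params. On device that dot product is realised as NEPixelWiseMultiplication followed by a SUM reduction along axis 1 because, as the source comment notes, GeMM does not support s32. A standalone sketch of the same stride arithmetic (illustrative only, outside the ACL tensor machinery):

#include <cassert>
#include <vector>

// Mirrors the flattenedCoeff loops in NeonGatherNdWorkload's constructor.
std::vector<unsigned int> FlattenedCoeff(const std::vector<unsigned int>& paramsShape,
                                         unsigned int nd)
{
    std::vector<unsigned int> coeff(nd, 1);
    for (unsigned int i = 1; i < nd; ++i)
    {
        coeff[i - 1] = paramsShape[i];
    }
    for (unsigned int i = nd - 1; i > 0; --i)
    {
        coeff[i - 1] *= coeff[i];
    }
    return coeff;
}

int main()
{
    // For params shape { 3, 4, 5 } with ND = 3 the strides are { 20, 5, 1 }:
    // an index row { i, j, k } flattens to i*20 + j*5 + k.
    assert((FlattenedCoeff({ 3, 4, 5 }, 3) == std::vector<unsigned int>{ 20, 5, 1 }));
    return 0;
}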
diff --git a/src/backends/neon/workloads/NeonGatherNdWorkload.hpp b/src/backends/neon/workloads/NeonGatherNdWorkload.hpp
new file mode 100644
index 0000000000..848aac667b
--- /dev/null
+++ b/src/backends/neon/workloads/NeonGatherNdWorkload.hpp
@@ -0,0 +1,41 @@
+//
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "NeonBaseWorkload.hpp"
+
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/NEON/functions/NEGather.h"
+#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
+#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
+#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
+
+namespace armnn
+{
+arm_compute::Status NeonGatherNdWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& indices,
+ const TensorInfo& output);
+
+class NeonGatherNdWorkload : public NeonBaseWorkload<GatherNdQueueDescriptor>
+{
+public:
+ NeonGatherNdWorkload(const GatherNdQueueDescriptor& descriptor, const WorkloadInfo& info);
+ virtual void Execute() const override;
+
+private:
+ arm_compute::Tensor m_FlattenedCoeff;
+ arm_compute::Tensor m_outputMul;
+ arm_compute::Tensor m_FlattenedIndices;
+ arm_compute::Tensor m_outputGather;
+
+ mutable arm_compute::NEPixelWiseMultiplication m_MulLayer;
+ mutable arm_compute::NEReductionOperation m_ReduceSumLayer;
+ mutable arm_compute::NEGather m_GatherLayer;
+ mutable arm_compute::NEReshapeLayer m_ReshapeLayer;
+
+};
+
+} //namespace armnn
\ No newline at end of file
diff --git a/src/backends/neon/workloads/NeonWorkloads.hpp b/src/backends/neon/workloads/NeonWorkloads.hpp
index 8b99f03a7f..024f1ca983 100644
--- a/src/backends/neon/workloads/NeonWorkloads.hpp
+++ b/src/backends/neon/workloads/NeonWorkloads.hpp
@@ -31,6 +31,7 @@
#include "NeonFloorFloatWorkload.hpp"
#include "NeonFullyConnectedWorkload.hpp"
#include "NeonGatherWorkload.hpp"
+#include "NeonGatherNdWorkload.hpp"
#include "NeonInstanceNormalizationWorkload.hpp"
#include "NeonL2NormalizationFloatWorkload.hpp"
#include "NeonLogWorkload.hpp"