From 989e2f6c71b979f6aaf3c653808a7893fb0dd1c3 Mon Sep 17 00:00:00 2001
From: Teresa Charlin
Date: Wed, 27 Apr 2022 16:26:11 +0100
Subject: IVGCVSW-6861 Add GATHERNd CL workload

Signed-off-by: Teresa Charlin
Change-Id: I8ba7e56062c285c672dcaa9d13be319eb4f1fca6
---
 docs/02_operator_list.dox                        |  16 +-
 src/backends/cl/ClLayerSupport.cpp               |  18 ++
 src/backends/cl/ClLayerSupport.hpp               |   5 +
 src/backends/cl/ClWorkloadFactory.cpp            |   5 +
 src/backends/cl/backend.mk                       |   1 +
 src/backends/cl/test/ClLayerTests.cpp            |  11 ++
 src/backends/cl/workloads/CMakeLists.txt         |   2 +
 src/backends/cl/workloads/ClGatherNdWorkload.cpp | 206 +++++++++++++++++++++++
 src/backends/cl/workloads/ClGatherNdWorkload.hpp |  42 +++++
 src/backends/cl/workloads/ClWorkloads.hpp        |   1 +
 10 files changed, 302 insertions(+), 5 deletions(-)
 create mode 100644 src/backends/cl/workloads/ClGatherNdWorkload.cpp
 create mode 100644 src/backends/cl/workloads/ClGatherNdWorkload.hpp

diff --git a/docs/02_operator_list.dox b/docs/02_operator_list.dox
index a02b4da2ff..960428999e 100644
--- a/docs/02_operator_list.dox
+++ b/docs/02_operator_list.dox
@@ -1498,13 +1498,19 @@ where N = batches, C = channels, H = height, W = width
     GpuAcc
-      TBD
+      BFLOAT16
+      FLOAT16
+      FLOAT32
+      QASYMMS8
+      QASYMMU8
+      QSYMMS16
+      SIGNED32
     InputLayer
     Special layer used to provide input data to the computational network.
diff --git a/src/backends/cl/ClLayerSupport.cpp b/src/backends/cl/ClLayerSupport.cpp
index 6b8cf52d87..9c40391f1a 100644
--- a/src/backends/cl/ClLayerSupport.cpp
+++ b/src/backends/cl/ClLayerSupport.cpp
@@ -41,6 +41,7 @@
 #include "workloads/ClFloorFloatWorkload.hpp"
 #include "workloads/ClFullyConnectedWorkload.hpp"
 #include "workloads/ClGatherWorkload.hpp"
+#include "workloads/ClGatherNdWorkload.hpp"
 #include "workloads/ClInstanceNormalizationWorkload.hpp"
 #include "workloads/ClL2NormalizationFloatWorkload.hpp"
 #include "workloads/ClLogWorkload.hpp"
@@ -372,6 +373,11 @@ bool ClLayerSupport::IsLayerSupported(const LayerType& type,
                                      infos[2],
                                      *(PolymorphicDowncast<const GatherDescriptor*>(&descriptor)),
                                      reasonIfUnsupported);
+        case LayerType::GatherNd:
+            return IsGatherNdSupported(infos[0],
+                                       infos[1],
+                                       infos[2],
+                                       reasonIfUnsupported);
         case LayerType::Input:
             return IsInputSupported(infos[0], reasonIfUnsupported);
         case LayerType::InstanceNormalization:
@@ -1021,6 +1027,18 @@ bool ClLayerSupport::IsGatherSupported(const TensorInfo& input0,
                                    descriptor);
 }
 
+bool ClLayerSupport::IsGatherNdSupported(const TensorInfo& input0,
+                                         const TensorInfo& input1,
+                                         const TensorInfo& output,
+                                         Optional<std::string&> reasonIfUnsupported) const
+{
+    FORWARD_WORKLOAD_VALIDATE_FUNC(ClGatherNdWorkloadValidate,
+                                   reasonIfUnsupported,
+                                   input0,
+                                   input1,
+                                   output);
+}
+
 bool ClLayerSupport::IsInputSupported(const TensorInfo& input,
                                       Optional<std::string&> reasonIfUnsupported) const
 {
diff --git a/src/backends/cl/ClLayerSupport.hpp b/src/backends/cl/ClLayerSupport.hpp
index 4f4e64e113..27311f74aa 100644
--- a/src/backends/cl/ClLayerSupport.hpp
+++ b/src/backends/cl/ClLayerSupport.hpp
@@ -148,6 +148,11 @@ public:
                                   const FullyConnectedDescriptor& descriptor,
                                   Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
 
+    bool IsGatherNdSupported(const TensorInfo& input0,
+                             const TensorInfo& input1,
+                             const TensorInfo& output,
+                             Optional<std::string&> reasonIfUnsupported) const;
+
     bool IsGatherSupported(const TensorInfo& input0,
                            const TensorInfo& input1,
                            const TensorInfo& output,
diff --git a/src/backends/cl/ClWorkloadFactory.cpp b/src/backends/cl/ClWorkloadFactory.cpp
index 213f474a97..d4a1cb081d 100644
--- a/src/backends/cl/ClWorkloadFactory.cpp
+++ b/src/backends/cl/ClWorkloadFactory.cpp
@@ -463,6 +463,11 @@ std::unique_ptr<IWorkload> ClWorkloadFactory::CreateWorkload(LayerType type,
             auto gatherQueueDescriptor = PolymorphicDowncast<const GatherQueueDescriptor*>(&descriptor);
             return MakeWorkload<ClGatherWorkload>(*gatherQueueDescriptor, info, m_CLCompileContext);
         }
+        case LayerType::GatherNd :
+        {
+            auto gatherNdQueueDescriptor = PolymorphicDowncast<const GatherNdQueueDescriptor*>(&descriptor);
+            return MakeWorkload<ClGatherNdWorkload>(*gatherNdQueueDescriptor, info, m_CLCompileContext);
+        }
         case LayerType::Input :
         {
             auto inputQueueDescriptor = PolymorphicDowncast<const InputQueueDescriptor*>(&descriptor);
diff --git a/src/backends/cl/backend.mk b/src/backends/cl/backend.mk
index bf9689a6e9..6fda16db05 100644
--- a/src/backends/cl/backend.mk
+++ b/src/backends/cl/backend.mk
@@ -50,6 +50,7 @@ BACKEND_SOURCES := \
         workloads/ClFloorFloatWorkload.cpp \
         workloads/ClFullyConnectedWorkload.cpp \
         workloads/ClGatherWorkload.cpp \
+        workloads/ClGatherNdWorkload.cpp \
        workloads/ClInstanceNormalizationWorkload.cpp \
        workloads/ClL2NormalizationFloatWorkload.cpp \
        workloads/ClLogWorkload.cpp \
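
The hunks above wire GatherNd into the standard CL backend plumbing: ClLayerSupport::IsGatherNdSupported forwards to ClGatherNdWorkloadValidate via FORWARD_WORKLOAD_VALIDATE_FUNC, and ClWorkloadFactory::CreateWorkload builds a ClGatherNdWorkload for LayerType::GatherNd. A minimal sketch of exercising that support query directly is shown here; the shapes, the default-constructed ClLayerSupport, the helper name CanRunGatherNdOnGpuAcc, and the assumption that src/backends is on the include path are all illustrative, not part of the patch:

    // Sketch only: checks whether a given GatherNd configuration would be
    // accepted by the CL backend, the same question the optimizer asks.
    #include <cl/ClLayerSupport.hpp>
    #include <armnn/Optional.hpp>
    #include <armnn/Tensor.hpp>
    #include <armnn/Types.hpp>
    #include <string>

    bool CanRunGatherNdOnGpuAcc(std::string& reason)
    {
        armnn::ClLayerSupport layerSupport;

        // Params hold K*C values, indices hold W tuples of length ND,
        // output shape is indices.shape[:-1] + params.shape[ND:].
        armnn::TensorInfo params ({ 5, 5, 2 }, armnn::DataType::Float32);
        armnn::TensorInfo indices({ 3, 2 },    armnn::DataType::Signed32);
        armnn::TensorInfo output ({ 3, 2 },    armnn::DataType::Float32);

        return layerSupport.IsGatherNdSupported(params, indices, output,
                                                armnn::Optional<std::string&>(reason));
    }
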
diff --git a/src/backends/cl/test/ClLayerTests.cpp b/src/backends/cl/test/ClLayerTests.cpp
index fd24043405..de39f986e0 100644
--- a/src/backends/cl/test/ClLayerTests.cpp
+++ b/src/backends/cl/test/ClLayerTests.cpp
@@ -1018,6 +1018,17 @@ ARMNN_AUTO_TEST_FIXTURE_WITH_THF(Gather1dParamsUint8, ClContextControlFixture, G
 ARMNN_AUTO_TEST_FIXTURE_WITH_THF(GatherMultiDimParamsFloat32, ClContextControlFixture, GatherMultiDimParamsFloat32Test)
 ARMNN_AUTO_TEST_FIXTURE_WITH_THF(GatherMultiDimParamsUint8, ClContextControlFixture, GatherMultiDimParamsUint8Test)
 
+// GatherNd
+ARMNN_AUTO_TEST_FIXTURE_WITH_THF(GatherNd2dFloat32, ClContextControlFixture, SimpleGatherNd2dTest<DataType::Float32>)
+ARMNN_AUTO_TEST_FIXTURE_WITH_THF(GatherNd3dFloat32, ClContextControlFixture, SimpleGatherNd3dTest<DataType::Float32>)
+ARMNN_AUTO_TEST_FIXTURE_WITH_THF(GatherNd4dFloat32, ClContextControlFixture, SimpleGatherNd4dTest<DataType::Float32>)
+ARMNN_AUTO_TEST_FIXTURE_WITH_THF(GatherNd2dInt8, ClContextControlFixture, SimpleGatherNd2dTest<DataType::QAsymmS8>)
+ARMNN_AUTO_TEST_FIXTURE_WITH_THF(GatherNd3dInt8, ClContextControlFixture, SimpleGatherNd3dTest<DataType::QAsymmS8>)
+ARMNN_AUTO_TEST_FIXTURE_WITH_THF(GatherNd4dInt8, ClContextControlFixture, SimpleGatherNd4dTest<DataType::QAsymmS8>)
+ARMNN_AUTO_TEST_FIXTURE_WITH_THF(GatherNd2dInt32, ClContextControlFixture, SimpleGatherNd2dTest<DataType::Signed32>)
+ARMNN_AUTO_TEST_FIXTURE_WITH_THF(GatherNd3dInt32, ClContextControlFixture, SimpleGatherNd3dTest<DataType::Signed32>)
+ARMNN_AUTO_TEST_FIXTURE_WITH_THF(GatherNd4dInt32, ClContextControlFixture, SimpleGatherNd4dTest<DataType::Signed32>)
+
 // Reshape
 ARMNN_AUTO_TEST_FIXTURE_WITH_THF(SimpleReshapeFloat32, ClContextControlFixture, SimpleReshapeTest<DataType::Float32>)
 ARMNN_AUTO_TEST_FIXTURE_WITH_THF(SimpleReshapeInt8, ClContextControlFixture, SimpleReshapeTest<DataType::QAsymmS8>)
diff --git a/src/backends/cl/workloads/CMakeLists.txt b/src/backends/cl/workloads/CMakeLists.txt
index 59e11cdf9f..aef7fc7ad2 100644
--- a/src/backends/cl/workloads/CMakeLists.txt
+++ b/src/backends/cl/workloads/CMakeLists.txt
@@ -52,6 +52,8 @@ list(APPEND armnnClBackendWorkloads_sources
     ClFullyConnectedWorkload.hpp
     ClGatherWorkload.cpp
     ClGatherWorkload.hpp
+    ClGatherNdWorkload.cpp
+    ClGatherNdWorkload.hpp
     ClInstanceNormalizationWorkload.cpp
     ClInstanceNormalizationWorkload.hpp
     ClLogWorkload.cpp
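
The new workload added below decomposes GatherNd into Mul → ReduceSum → Gather → Reshape, driven by four sizes: ND (length of each index tuple), W (number of index tuples), K (rows of the flattened params table) and C (elements gathered per row), obtained from CalculateGatherNdKeyIndices. A standalone sketch of that shape bookkeeping follows; the helper names and the example shapes are illustrative only, and the backend itself uses CalculateGatherNdKeyIndices from backendsCommon/WorkloadUtils:

    #include <cstdint>
    #include <map>
    #include <string>
    #include <vector>

    // Illustrative re-implementation of the key-index computation used below.
    std::map<std::string, unsigned int> KeyIndices(const std::vector<unsigned int>& paramsShape,
                                                   const std::vector<unsigned int>& indicesShape)
    {
        unsigned int nd = indicesShape.back();                               // index tuple length
        unsigned int w = 1;                                                  // number of index tuples
        for (size_t i = 0; i + 1 < indicesShape.size(); ++i) { w *= indicesShape[i]; }
        unsigned int k = 1;                                                  // rows of flattened params
        for (unsigned int i = 0; i < nd; ++i)                { k *= paramsShape[i]; }
        unsigned int c = 1;                                                  // elements per gathered row
        for (size_t i = nd; i < paramsShape.size(); ++i)     { c *= paramsShape[i]; }
        return {{"ND", nd}, {"W", w}, {"K", k}, {"C", c}};
    }

    // Row-major strides over the first ND params dimensions; multiplying each
    // index tuple by these and summing gives one flat row index into the {K, C}
    // table. nd is at least 1 for a well-formed GatherNd.
    std::vector<int32_t> FlattenedCoeff(const std::vector<unsigned int>& paramsShape, unsigned int nd)
    {
        std::vector<int32_t> coeff(nd, 1);
        for (unsigned int i = nd - 1; i > 0; --i)
        {
            coeff[i - 1] = coeff[i] * static_cast<int32_t>(paramsShape[i]);
        }
        return coeff;
    }

    // Example: params {5, 5, 2}, indices {3, 2}  ->  ND=2, W=3, K=25, C=2, coeff={5, 1}.
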
diff --git a/src/backends/cl/workloads/ClGatherNdWorkload.cpp b/src/backends/cl/workloads/ClGatherNdWorkload.cpp
new file mode 100644
index 0000000000..f68914645e
--- /dev/null
+++ b/src/backends/cl/workloads/ClGatherNdWorkload.cpp
@@ -0,0 +1,206 @@
+//
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClGatherNdWorkload.hpp"
+#include "ClWorkloadUtils.hpp"
+#include "backendsCommon/WorkloadUtils.hpp"
+#include <aclCommon/ArmComputeUtils.hpp>
+#include <cl/ClTensorHandle.hpp>
+
+using namespace armnn::armcomputetensorutils;
+
+namespace armnn
+{
+arm_compute::Status ClGatherNdWorkloadValidate(const TensorInfo& paramsInfo,
+                                               const TensorInfo& indicesInfo,
+                                               const TensorInfo& outputInfo)
+{
+    // Calculate ND, K, W, C.
+    std::map<std::string, unsigned int> keyIndices = CalculateGatherNdKeyIndices(paramsInfo, indicesInfo);
+
+    /// Validate Mul
+    // Indices with shape { W, ND }
+    armnn::TensorInfo indices_W_ND_Info = indicesInfo;
+    indices_W_ND_Info.SetShape({ keyIndices["W"], keyIndices["ND"] });
+    const arm_compute::TensorInfo aclIndicesInfo = BuildArmComputeTensorInfo(indices_W_ND_Info);
+
+    // Flattened coefficients with shape { ND }
+    armnn::TensorInfo flattenedCoeff_Info = indicesInfo;
+    flattenedCoeff_Info.SetShape({ keyIndices["ND"] });
+    const arm_compute::TensorInfo aclFlattenedCoeffInfo = BuildArmComputeTensorInfo(flattenedCoeff_Info);
+
+    // Output of Mul with shape { W, ND }
+    const arm_compute::TensorInfo aclOutputMulInfo = BuildArmComputeTensorInfo(indices_W_ND_Info);
+
+    auto statusMul = arm_compute::CLPixelWiseMultiplication::validate(&aclIndicesInfo,
+                                                                      &aclFlattenedCoeffInfo,
+                                                                      &aclOutputMulInfo,
+                                                                      1.0f,
+                                                                      arm_compute::ConvertPolicy::WRAP,
+                                                                      arm_compute::RoundingPolicy::TO_ZERO,
+                                                                      arm_compute::ActivationLayerInfo());
+
+    /// Validate ReduceSum
+    // Flattened indices with shape { W }
+    armnn::TensorInfo flattenedIndices_Info = indicesInfo;
+    flattenedIndices_Info.SetShape({ keyIndices["W"] });
+    const arm_compute::TensorInfo aclFlattenedIndicesInfo = BuildArmComputeTensorInfo(flattenedIndices_Info);
+
+    const std::vector<unsigned int> armnnReduceAxes(1, 1);
+    arm_compute::Coordinates coords = BuildArmComputeReductionCoordinates(aclOutputMulInfo.num_dimensions(),
+                                                                          indices_W_ND_Info.GetNumDimensions(),
+                                                                          armnnReduceAxes);
+
+    auto statusReduceSum = arm_compute::CLReductionOperation::validate(&aclOutputMulInfo,
+                                                                       &aclFlattenedIndicesInfo,
+                                                                       static_cast<unsigned int>(coords[0]),
+                                                                       arm_compute::ReductionOperation::SUM,
+                                                                       false);
+
+    /// Validate Gather
+    // Params with shape { K, C }
+    armnn::TensorInfo params_K_C_Info = paramsInfo;
+    params_K_C_Info.SetShape({ keyIndices["K"], keyIndices["C"] });
+    const arm_compute::TensorInfo aclParamsInfo = BuildArmComputeTensorInfo(params_K_C_Info);
+
+    // Output of gather with shape { W, C }
+    armnn::TensorInfo outputGather_Info = outputInfo;
+    outputGather_Info.SetShape({ keyIndices["W"], keyIndices["C"] });
+    const arm_compute::TensorInfo aclOutputGatherInfo = BuildArmComputeTensorInfo(outputGather_Info);
+
+    auto aclAxis = ComputeAclAxis(0, params_K_C_Info);
+    auto statusGather =
+        arm_compute::CLGather::validate(&aclParamsInfo, &aclFlattenedIndicesInfo, &aclOutputGatherInfo, aclAxis);
+
+    /// Validate Reshape
+    const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(outputInfo);
+
+    auto statusReshape = arm_compute::CLReshapeLayer::validate(&aclOutputGatherInfo, &aclOutputInfo);
+
+    /// Return OK if all the layers are valid
+    auto okCode = arm_compute::ErrorCode::OK;
+    if (statusMul.error_code()       == okCode &&
+        statusReduceSum.error_code() == okCode &&
+        statusGather.error_code()    == okCode &&
+        statusReshape.error_code()   == okCode)
+    {
+        return arm_compute::Status(arm_compute::ErrorCode::OK,
+                                   "All GatherND layers validate status OK.");
+    }
+    else
+    {
+        return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR,
+                                   "GatherND layer validate status failed.");
+    }
+}
+
+ClGatherNdWorkload::ClGatherNdWorkload(const GatherNdQueueDescriptor& descriptor,
+                                       const WorkloadInfo& info,
+                                       const arm_compute::CLCompileContext& clCompileContext)
+    : ClBaseWorkload<GatherNdQueueDescriptor>(descriptor, info)
+{
+    m_Data.ValidateInputsOutputs("ClGatherNdWorkload", 2, 1);
+
+    TensorInfo paramsInfo  = info.m_InputTensorInfos[0];
+    TensorInfo indicesInfo = info.m_InputTensorInfos[1];
+    TensorInfo outputInfo  = info.m_OutputTensorInfos[0];
+
+    arm_compute::ICLTensor& input   = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+    arm_compute::ICLTensor& indices = static_cast<IClTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
+    arm_compute::ICLTensor& output  = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+    // Calculate ND, K, W, C.
+    std::map<std::string, unsigned int> keyIndices = CalculateGatherNdKeyIndices(paramsInfo, indicesInfo);
+
+    /// Calculate flattened indices: m_FlattenedIndices = indices * m_FlattenedCoeff.
+    /// This could be done using MatMul instead of multiplication followed by reduce sum operation,
+    /// but GeMM does not support s32 at the moment.
+
+    // Prepare the tensor to store the output of the reduce_sum operation
+    armnn::TensorInfo flattenedIndices_Info = indicesInfo;
+    flattenedIndices_Info.SetShape({ keyIndices["W"] });
+    BuildArmComputeTensor(m_FlattenedIndices, flattenedIndices_Info);
+    armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_FlattenedIndices);
+
+    // Reshape indices into { W, ND }
+    indices.info()->set_tensor_shape(BuildArmComputeTensorShape({ keyIndices["W"], keyIndices["ND"] }));
+
+    // Calculate the m_FlattenedCoeff
+    TensorShape paramsShape = paramsInfo.GetShape();
+    std::vector<int32_t> flattenedCoeff(keyIndices["ND"], 1);
+    for (unsigned int i = 1; i < keyIndices["ND"]; ++i)
+    {
+        flattenedCoeff[i - 1] = static_cast<int32_t>(paramsShape[i]);
+    }
+    for (unsigned int i = keyIndices["ND"] - 1; i > 0; --i)
+    {
+        flattenedCoeff[i - 1] *= flattenedCoeff[i];
+    }
+    armnn::TensorInfo flattenedCoeff_Info = indicesInfo;
+    flattenedCoeff_Info.SetShape({ keyIndices["ND"] });
+    BuildArmComputeTensor(m_FlattenedCoeff, flattenedCoeff_Info);
+    armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_FlattenedCoeff);
+    ARMNN_ASSERT_MSG(indicesInfo.GetDataType() == DataType::Signed32,
+                     "flattenedCoeff must be same data type as m_FlattenedCoeff");
+    CopyArmComputeClTensorData(m_FlattenedCoeff, flattenedCoeff.data());
+
+    // Prepare the tensor to store the output of the multiplication
+    armnn::TensorInfo outputMul_Info = indicesInfo;
+    outputMul_Info.SetShape({ keyIndices["W"], keyIndices["ND"] });
+    BuildArmComputeTensor(m_OutputMul, outputMul_Info);
+    armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_OutputMul);
+
+    // Multiply
+    m_MulLayer.configure(clCompileContext,
+                         &indices,
+                         &m_FlattenedCoeff,
+                         &m_OutputMul,
+                         1.0f,
+                         arm_compute::ConvertPolicy::WRAP,
+                         arm_compute::RoundingPolicy::TO_ZERO,
+                         arm_compute::ActivationLayerInfo());
+
+    // Reduce Sum
+    const std::vector<unsigned int> armnnReduceAxes(1, 1);
+    arm_compute::Coordinates coords = BuildArmComputeReductionCoordinates(m_OutputMul.info()->num_dimensions(),
+                                                                          outputMul_Info.GetNumDimensions(),
+                                                                          armnnReduceAxes);
+    m_ReduceSumLayer.configure(clCompileContext,
+                               &m_OutputMul,
+                               &m_FlattenedIndices,
+                               static_cast<unsigned int>(coords[0]),
+                               arm_compute::ReductionOperation::SUM,
+                               false);
+
+    /// Call Gather with adequate shapes
+    // Reshape params into { K, C }
+    paramsInfo.SetShape({ keyIndices["K"], keyIndices["C"] });
+    input.info()->set_tensor_shape(BuildArmComputeTensorShape(paramsInfo.GetShape()));
+
+    // Reshape output to have the shape given by gather { W, C }
+    // (the original outputInfo has the shape given by gatherNd)
+    armnn::TensorInfo outputGather_Info = outputInfo;
+    outputGather_Info.SetShape({ keyIndices["W"], keyIndices["C"] });
+    BuildArmComputeTensor(m_OutputGather, outputGather_Info);
+    armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_OutputGather);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClGatherNdWorkload_configure");
+        auto aclAxis = ComputeAclAxis(0, paramsInfo);
+        m_GatherLayer.configure(clCompileContext, &input, &m_FlattenedIndices, &m_OutputGather, aclAxis);
+    }
+
+    // Reshape output to the original output shape
+    m_ReshapeLayer.configure(clCompileContext, &m_OutputGather, &output);
+};
+
+void ClGatherNdWorkload::Execute() const
+{
+    ARMNN_SCOPED_PROFILING_EVENT_CL_GUID("ClGatherNdWorkload_Execute", this->GetGuid());
+    RunClFunction(m_MulLayer, CHECK_LOCATION());
+    RunClFunction(m_ReduceSumLayer, CHECK_LOCATION());
+    RunClFunction(m_GatherLayer, CHECK_LOCATION());
+    RunClFunction(m_ReshapeLayer, CHECK_LOCATION());
+}
+} // namespace armnn
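
To see what the Mul → ReduceSum → Gather → Reshape pipeline configured above computes, here is a loop-based reference of the same flattened-index formulation on plain row-major vectors; the function name and parameters are illustrative, and the real workload performs these steps on CL tensors via the ACL functions listed in the header below:

    #include <cstdint>
    #include <vector>

    // Reference GatherNd on row-major float data, using the same W/ND/K/C view
    // the CL workload builds: params is treated as a {K, C} table, indices as W
    // tuples of length ND, and each tuple is flattened to one row index via 'coeff'.
    std::vector<float> GatherNdReference(const std::vector<float>& params,    // K * C values
                                         const std::vector<int32_t>& indices, // W * ND values
                                         const std::vector<int32_t>& coeff,   // ND row-major strides
                                         unsigned int w, unsigned int nd, unsigned int c)
    {
        std::vector<float> out(w * c);
        for (unsigned int i = 0; i < w; ++i)
        {
            // Mul + ReduceSum: dot product of one index tuple with the strides.
            int32_t flatRow = 0;
            for (unsigned int j = 0; j < nd; ++j)
            {
                flatRow += indices[i * nd + j] * coeff[j];
            }
            // Gather: copy row 'flatRow' of the {K, C} table. The final Reshape is
            // implicit here because 'out' is already laid out as {W, C}.
            for (unsigned int j = 0; j < c; ++j)
            {
                out[i * c + j] = params[static_cast<unsigned int>(flatRow) * c + j];
            }
        }
        return out;
    }
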
diff --git a/src/backends/cl/workloads/ClGatherNdWorkload.hpp b/src/backends/cl/workloads/ClGatherNdWorkload.hpp
new file mode 100644
index 0000000000..dd30024cc7
--- /dev/null
+++ b/src/backends/cl/workloads/ClGatherNdWorkload.hpp
@@ -0,0 +1,42 @@
+//
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "ClBaseWorkload.hpp"
+
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/CL/functions/CLGather.h"
+#include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
+#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
+#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
+
+namespace armnn
+{
+arm_compute::Status ClGatherNdWorkloadValidate(const TensorInfo& params,
+                                               const TensorInfo& indices,
+                                               const TensorInfo& output);
+
+class ClGatherNdWorkload : public ClBaseWorkload<GatherNdQueueDescriptor>
+{
+public:
+    ClGatherNdWorkload(const GatherNdQueueDescriptor& descriptor,
+                       const WorkloadInfo& info,
+                       const arm_compute::CLCompileContext& clCompileContext);
+    virtual void Execute() const override;
+
+private:
+    arm_compute::CLTensor m_FlattenedCoeff;
+    arm_compute::CLTensor m_OutputMul;
+    arm_compute::CLTensor m_FlattenedIndices;
+    arm_compute::CLTensor m_OutputGather;
+
+    mutable arm_compute::CLPixelWiseMultiplication m_MulLayer;
+    mutable arm_compute::CLReductionOperation m_ReduceSumLayer;
+    mutable arm_compute::CLGather m_GatherLayer;
+    mutable arm_compute::CLReshapeLayer m_ReshapeLayer;
+};
+
+} //namespace armnn
\ No newline at end of file
diff --git a/src/backends/cl/workloads/ClWorkloads.hpp b/src/backends/cl/workloads/ClWorkloads.hpp
index 27119bb2dc..71f401ae0e 100644
--- a/src/backends/cl/workloads/ClWorkloads.hpp
+++ b/src/backends/cl/workloads/ClWorkloads.hpp
@@ -25,6 +25,7 @@
 #include "ClFloorFloatWorkload.hpp"
 #include "ClFullyConnectedWorkload.hpp"
 #include "ClGatherWorkload.hpp"
+#include "ClGatherNdWorkload.hpp"
 #include "ClInstanceNormalizationWorkload.hpp"
 #include "ClL2NormalizationFloatWorkload.hpp"
 #include "ClLogWorkload.hpp"
--
cgit v1.2.1
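
With the workload registered, GatherNd on GpuAcc should be reachable through the normal Arm NN graph API. A rough, hedged sketch follows; AddGatherNdLayer is assumed to be available on INetwork in this version of the codebase, and all shapes and names are illustrative rather than taken from the patch:

    #include <armnn/ArmNN.hpp>

    // Sketch of driving the new CL workload end to end through the public API.
    void RunGatherNdOnGpuAcc()
    {
        using namespace armnn;

        INetworkPtr network = INetwork::Create();
        IConnectableLayer* params   = network->AddInputLayer(0, "params");
        IConnectableLayer* indices  = network->AddInputLayer(1, "indices");
        IConnectableLayer* gatherNd = network->AddGatherNdLayer("gathernd"); // assumed API
        IConnectableLayer* result   = network->AddOutputLayer(0, "output");

        params->GetOutputSlot(0).Connect(gatherNd->GetInputSlot(0));
        indices->GetOutputSlot(0).Connect(gatherNd->GetInputSlot(1));
        gatherNd->GetOutputSlot(0).Connect(result->GetInputSlot(0));

        params->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 5, 5, 2 }, DataType::Float32));
        indices->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 3, 2 },   DataType::Signed32));
        gatherNd->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 3, 2 },  DataType::Float32));

        // Request GpuAcc so the optimizer assigns the layer to ClGatherNdWorkload.
        IRuntimePtr runtime = IRuntime::Create(IRuntime::CreationOptions());
        IOptimizedNetworkPtr optNet = Optimize(*network, { Compute::GpuAcc }, runtime->GetDeviceSpec());

        NetworkId networkId;
        runtime->LoadNetwork(networkId, std::move(optNet));
        // ... fill input tensors and call runtime->EnqueueWorkload(networkId, inputs, outputs);
    }
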