From 0f86ecfce593a302ebd2baf8b70c9f6f50616f81 Mon Sep 17 00:00:00 2001 From: Teresa Charlin Date: Thu, 13 Oct 2022 15:47:08 +0100 Subject: IVGCVSW-6494 Add CpuAcc Batch MatMul Workload Fp32 Signed-off-by: Teresa Charlin Change-Id: I2def6995f81d33e68f1ea45d8d19a1e6294049b1 --- delegate/src/test/BatchMatMulTest.cpp | 16 ++ docs/02_operator_list.dox | 9 +- src/backends/backendsCommon/WorkloadUtils.cpp | 20 +++ src/backends/backendsCommon/WorkloadUtils.hpp | 6 + .../test/layerTests/BatchMatMulTestImpl.cpp | 169 ++---------------- src/backends/neon/NeonLayerSupport.cpp | 21 +++ src/backends/neon/NeonLayerSupport.hpp | 6 + src/backends/neon/NeonWorkloadFactory.cpp | 5 + src/backends/neon/backend.mk | 1 + src/backends/neon/test/NeonLayerTests.cpp | 17 ++ src/backends/neon/workloads/CMakeLists.txt | 2 + .../neon/workloads/NeonBatchMatMulWorkload.cpp | 190 +++++++++++++++++++++ .../neon/workloads/NeonBatchMatMulWorkload.hpp | 41 +++++ src/backends/neon/workloads/NeonWorkloads.hpp | 1 + 14 files changed, 344 insertions(+), 160 deletions(-) create mode 100644 src/backends/neon/workloads/NeonBatchMatMulWorkload.cpp create mode 100644 src/backends/neon/workloads/NeonBatchMatMulWorkload.hpp diff --git a/delegate/src/test/BatchMatMulTest.cpp b/delegate/src/test/BatchMatMulTest.cpp index 5469bc845c..e5cb976c45 100644 --- a/delegate/src/test/BatchMatMulTest.cpp +++ b/delegate/src/test/BatchMatMulTest.cpp @@ -654,4 +654,20 @@ namespace armnnDelegate } } + TEST_SUITE("BATCH_MATMUL_CpuAccTests") + { + TEST_CASE("BATCH_MATMUL_Fp32_CpuAccTests") + { + std::vector backends = {armnn::Compute::CpuAcc}; + BatchMatMul2DFp32SimpleTest (backends); + BatchMatMul3DFp32SimpleTest (backends); + BatchMatMul4DFp32SimpleTest (backends); + BatchMatMul3DFp32BatchTest (backends); + BatchMatMul3DFp32BroadcastTest (backends); + BatchMatMul3D2DFp32BroadcastTest (backends); + BatchMatMul2DFp32TinyTest (backends); + BatchMatMulNonSquareFp32Test (backends); + BatchMatMul2DFp32SimpleAdjointTest(backends); + } + } } diff --git a/docs/02_operator_list.dox b/docs/02_operator_list.dox index 658aa07d1d..3a902c8883 100644 --- a/docs/02_operator_list.dox +++ b/docs/02_operator_list.dox @@ -293,12 +293,13 @@ where N = batches, C = channels, H = height, W = width CpuAcc
     <td>
-        <ul>
-         <li>N/A
-        </ul>
+        <ul>
+         <li>All
+        </ul>
     <td>
-        <ul>
-         <li>N/A
-        </ul>
+        <table>
+    <tr><th>
+    <tr><td>FLOAT32
+    </table>
GpuAcc diff --git a/src/backends/backendsCommon/WorkloadUtils.cpp b/src/backends/backendsCommon/WorkloadUtils.cpp index b045530abc..3aea667bfe 100644 --- a/src/backends/backendsCommon/WorkloadUtils.cpp +++ b/src/backends/backendsCommon/WorkloadUtils.cpp @@ -341,4 +341,24 @@ std::map CalculateGatherNdKeyIndices(TensorInfo input return keyIndices; } +armnn::PermutationVector GeneratePermutationVectorOnLastTwoDimensions(unsigned int rank) +{ + armnn::PermutationVector permutationVector{}; + switch (rank) + { + case 2: + permutationVector = {1U, 0U}; + break; + case 3: + permutationVector = {0U, 2U, 1U}; + break; + case 4: + permutationVector = {0U, 1U, 3U, 2U}; + break; + default: + throw Exception("Invalid number of dimensions."); + } + return permutationVector; +} + } // namespace armnn diff --git a/src/backends/backendsCommon/WorkloadUtils.hpp b/src/backends/backendsCommon/WorkloadUtils.hpp index 0e5487336f..3d8d927345 100644 --- a/src/backends/backendsCommon/WorkloadUtils.hpp +++ b/src/backends/backendsCommon/WorkloadUtils.hpp @@ -258,4 +258,10 @@ std::tuple Convert1HWOtoMIHW(const ConstTensorHandle* /// \return - A map with names and values for N, ND, K, W, C std::map CalculateGatherNdKeyIndices(TensorInfo inputInfo0, TensorInfo inputInfo1); +/// Generates a permutation vector of size rank that permutes the 2 most right dimensions +/// +/// \param rank - Tensor rank, i.e. number of dimensions in the tensors +/// \return - A permutation vector that permutes the 2 last dimensions +armnn::PermutationVector GeneratePermutationVectorOnLastTwoDimensions(unsigned int rank); + } //namespace armnn diff --git a/src/backends/backendsCommon/test/layerTests/BatchMatMulTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/BatchMatMulTestImpl.cpp index 6fcc35ab52..74bd97f103 100644 --- a/src/backends/backendsCommon/test/layerTests/BatchMatMulTestImpl.cpp +++ b/src/backends/backendsCommon/test/layerTests/BatchMatMulTestImpl.cpp @@ -71,20 +71,9 @@ LayerTestResult BatchMatMul2DSimpleTest( { auto descriptor = armnn::BatchMatMulDescriptor(); // Arbitrary layout with no transpose/adjointing - float qScale = 0.0f; + float qScale = 1.0f; int32_t qOffset = 0; - switch(ArmnnType) - { - case armnn::DataType::QAsymmS8: - case armnn::DataType::QAsymmU8: - case armnn::DataType::QSymmS16: - qScale = 1.0f; - break; - default: - break; - } - armnn::TensorInfo inputXInfo({2,2}, ArmnnType, qScale, qOffset); armnn::TensorInfo inputYInfo({2,2}, ArmnnType, qScale, qOffset); armnn::TensorInfo outputInfo({2,2}, ArmnnType, qScale, qOffset); @@ -160,20 +149,9 @@ LayerTestResult BatchMatMul3DSimpleTest( { auto descriptor = armnn::BatchMatMulDescriptor(); // Arbitrary layout with no transpose/adjointing - float qScale = 0.0f; + float qScale = 1.0f; int32_t qOffset = 0; - switch(ArmnnType) - { - case armnn::DataType::QAsymmS8: - case armnn::DataType::QAsymmU8: - case armnn::DataType::QSymmS16: - qScale = 1.0f; - break; - default: - break; - } - armnn::TensorInfo inputXInfo({1,2,2}, ArmnnType, qScale, qOffset); armnn::TensorInfo inputYInfo({1,2,2}, ArmnnType, qScale, qOffset); armnn::TensorInfo outputInfo({1,2,2}, ArmnnType, qScale, qOffset); @@ -249,20 +227,9 @@ LayerTestResult BatchMatMulNCHWSimpleTest( { auto descriptor = armnn::BatchMatMulDescriptor(); // Default arbitrary layout is treated the same as NCHW - float qScale = 0.0f; + float qScale = 1.0f; int32_t qOffset = 0; - switch(ArmnnType) - { - case armnn::DataType::QAsymmS8: - case armnn::DataType::QAsymmU8: - case armnn::DataType::QSymmS16: - qScale = 1.0f; - break; 
- default: - break; - } - armnn::TensorInfo inputXInfo({1,1,2,2}, ArmnnType, qScale, qOffset); armnn::TensorInfo inputYInfo({1,1,2,2}, ArmnnType, qScale, qOffset); armnn::TensorInfo outputInfo({1,1,2,2}, ArmnnType, qScale, qOffset); @@ -343,20 +310,9 @@ LayerTestResult BatchMatMulNHWCSimpleTest( armnn::DataLayout::NHWC, armnn::DataLayout::NHWC); - float qScale = 0.0f; + float qScale = 1.0f; int32_t qOffset = 0; - switch(ArmnnType) - { - case armnn::DataType::QAsymmS8: - case armnn::DataType::QAsymmU8: - case armnn::DataType::QSymmS16: - qScale = 1.0f; - break; - default: - break; - } - armnn::TensorInfo inputXInfo({1,2,2,1}, ArmnnType, qScale, qOffset); armnn::TensorInfo inputYInfo({1,2,2,1}, ArmnnType, qScale, qOffset); armnn::TensorInfo outputInfo({1,2,2,1}, ArmnnType, qScale, qOffset); @@ -432,20 +388,9 @@ LayerTestResult BatchMatMul3DBatchTest( { auto descriptor = armnn::BatchMatMulDescriptor(); // Arbitrary layout with no transpose/adjointing - float qScale = 0.0f; + float qScale = 1.0f; int32_t qOffset = 0; - switch(ArmnnType) - { - case armnn::DataType::QAsymmS8: - case armnn::DataType::QAsymmU8: - case armnn::DataType::QSymmS16: - qScale = 1.0f; - break; - default: - break; - } - armnn::TensorInfo inputXInfo({2,2,2}, ArmnnType, qScale, qOffset); armnn::TensorInfo inputYInfo({2,2,2}, ArmnnType, qScale, qOffset); armnn::TensorInfo outputInfo({2,2,2}, ArmnnType, qScale, qOffset); @@ -530,20 +475,9 @@ LayerTestResult BatchMatMul3DBroadcastTest( { auto descriptor = armnn::BatchMatMulDescriptor(); // Arbitrary layout with no transpose/adjointing - float qScale = 0.0f; + float qScale = 1.0f; int32_t qOffset = 0; - switch(ArmnnType) - { - case armnn::DataType::QAsymmS8: - case armnn::DataType::QAsymmU8: - case armnn::DataType::QSymmS16: - qScale = 1.0f; - break; - default: - break; - } - armnn::TensorInfo inputXInfo({2,2,2}, ArmnnType, qScale, qOffset); armnn::TensorInfo inputYInfo({1,2,2}, ArmnnType, qScale, qOffset); armnn::TensorInfo outputInfo({2,2,2}, ArmnnType, qScale, qOffset); @@ -625,20 +559,9 @@ LayerTestResult BatchMatMul3D2DBroadcastTest( { auto descriptor = armnn::BatchMatMulDescriptor(); // Arbitrary layout with no transpose/adjointing - float qScale = 0.0f; + float qScale = 1.0f; int32_t qOffset = 0; - switch(ArmnnType) - { - case armnn::DataType::QAsymmS8: - case armnn::DataType::QAsymmU8: - case armnn::DataType::QSymmS16: - qScale = 1.0f; - break; - default: - break; - } - armnn::TensorInfo inputXInfo({2,2,2}, ArmnnType, qScale, qOffset); armnn::TensorInfo inputYInfo({2,2}, ArmnnType, qScale, qOffset); armnn::TensorInfo outputInfo({2,2,2}, ArmnnType, qScale, qOffset); @@ -725,20 +648,9 @@ LayerTestResult BatchMatMulNDHWCNHWCTest( armnn::DataLayout::NDHWC, armnn::DataLayout::NHWC); - float qScale = 0.0f; + float qScale = 1.0f; int32_t qOffset = 0; - switch(ArmnnType) - { - case armnn::DataType::QAsymmS8: - case armnn::DataType::QAsymmU8: - case armnn::DataType::QSymmS16: - qScale = 1.0f; - break; - default: - break; - } - armnn::TensorInfo inputXInfo({1,1,2,2,2}, ArmnnType, qScale, qOffset); armnn::TensorInfo inputYInfo({1,2,2,2}, ArmnnType, qScale, qOffset); armnn::TensorInfo outputInfo({1,1,2,2,2}, ArmnnType, qScale, qOffset); @@ -823,20 +735,9 @@ LayerTestResult BatchMatMul2DTinyTest( { auto descriptor = armnn::BatchMatMulDescriptor(); // Arbitrary layout with no transpose/adjointing - float qScale = 0.0f; + float qScale = 1.0f; int32_t qOffset = 0; - switch(ArmnnType) - { - case armnn::DataType::QAsymmS8: - case armnn::DataType::QAsymmU8: - case 
armnn::DataType::QSymmS16: - qScale = 1.0f; - break; - default: - break; - } - armnn::TensorInfo inputXInfo({1,1}, ArmnnType, qScale, qOffset); armnn::TensorInfo inputYInfo({1,1}, ArmnnType, qScale, qOffset); armnn::TensorInfo outputInfo({1,1}, ArmnnType, qScale, qOffset); @@ -909,20 +810,9 @@ LayerTestResult BatchMatMul3DNonSquareTest( { auto descriptor = armnn::BatchMatMulDescriptor(); // Arbitrary layout with no transpose/adjointing - float qScale = 0.0f; + float qScale = 1.0f; int32_t qOffset = 0; - switch(ArmnnType) - { - case armnn::DataType::QAsymmS8: - case armnn::DataType::QAsymmU8: - case armnn::DataType::QSymmS16: - qScale = 1.0f; - break; - default: - break; - } - armnn::TensorInfo inputXInfo({2,5,3}, ArmnnType, qScale, qOffset); armnn::TensorInfo inputYInfo({2,3,4}, ArmnnType, qScale, qOffset); armnn::TensorInfo outputInfo({2,5,4}, ArmnnType, qScale, qOffset); @@ -1024,20 +914,9 @@ LayerTestResult BatchMatMul2DTranspSimpleTest( false, false); - float qScale = 0.0f; + float qScale = 1.0f; int32_t qOffset = 0; - switch(ArmnnType) - { - case armnn::DataType::QAsymmS8: - case armnn::DataType::QAsymmU8: - case armnn::DataType::QSymmS16: - qScale = 1.0f; - break; - default: - break; - } - armnn::TensorInfo inputXInfo({2,3}, ArmnnType, qScale, qOffset); armnn::TensorInfo inputYInfo({2,3}, ArmnnType, qScale, qOffset); armnn::TensorInfo outputInfo({3,3}, ArmnnType, qScale, qOffset); @@ -1117,20 +996,9 @@ LayerTestResult BatchMatMul2DAdjointSimpleTest( true, false); - float qScale = 0.0f; + float qScale = 1.0f; int32_t qOffset = 0; - switch(ArmnnType) - { - case armnn::DataType::QAsymmS8: - case armnn::DataType::QAsymmU8: - case armnn::DataType::QSymmS16: - qScale = 1.0f; - break; - default: - break; - } - armnn::TensorInfo inputXInfo({3,3}, ArmnnType, qScale, qOffset); armnn::TensorInfo inputYInfo({3,3}, ArmnnType, qScale, qOffset); armnn::TensorInfo outputInfo({3,3}, ArmnnType, qScale, qOffset); @@ -1227,20 +1095,9 @@ LayerTestResult BatchMatMulNHWCParamsTest( armnn::DataLayout::NHWC, armnn::DataLayout::NHWC); - float qScale = 0.0f; + float qScale = 1.0f; int32_t qOffset = 0; - switch(ArmnnType) - { - case armnn::DataType::QAsymmS8: - case armnn::DataType::QAsymmU8: - case armnn::DataType::QSymmS16: - qScale = 1.0f; - break; - default: - break; - } - armnn::TensorInfo inputXInfo({1,4,4,2}, ArmnnType, qScale, qOffset); armnn::TensorInfo inputYInfo({2,2,4,1}, ArmnnType, qScale, qOffset); armnn::TensorInfo outputInfo({2,4,2,2}, ArmnnType, qScale, qOffset); diff --git a/src/backends/neon/NeonLayerSupport.cpp b/src/backends/neon/NeonLayerSupport.cpp index cf541f491b..7f311d8684 100644 --- a/src/backends/neon/NeonLayerSupport.cpp +++ b/src/backends/neon/NeonLayerSupport.cpp @@ -24,6 +24,7 @@ #include "workloads/NeonAdditionWorkload.hpp" #include "workloads/NeonActivationWorkload.hpp" #include "workloads/NeonArgMinMaxWorkload.hpp" +#include "workloads/NeonBatchMatMulWorkload.hpp" #include "workloads/NeonBatchNormalizationWorkload.hpp" #include "workloads/NeonBatchToSpaceNdWorkload.hpp" #include "workloads/NeonCastWorkload.hpp" @@ -171,6 +172,12 @@ bool NeonLayerSupport::IsLayerSupported(const LayerType& type, infos[1], *(PolymorphicDowncast(&descriptor)), reasonIfUnsupported); + case LayerType::BatchMatMul: + return IsBatchMatMulSupported(infos[0], + infos[1], + infos[2], + *(PolymorphicDowncast(&descriptor)), + reasonIfUnsupported); case LayerType::BatchNormalization: return IsBatchNormalizationSupported(infos[0], infos[1], @@ -627,6 +634,20 @@ bool 
NeonLayerSupport::IsArgMinMaxSupported(const TensorInfo& input, descriptor); } +bool NeonLayerSupport::IsBatchMatMulSupported(const TensorInfo& inputX, + const TensorInfo& inputY, + const TensorInfo& output, + const BatchMatMulDescriptor& descriptor, + Optional reasonIfUnsupported) const +{ + FORWARD_WORKLOAD_VALIDATE_FUNC(NeonBatchMatMulValidate, + reasonIfUnsupported, + inputX, + inputY, + output, + descriptor); +} + bool NeonLayerSupport::IsBatchNormalizationSupported(const TensorInfo& input, const TensorInfo& output, const TensorInfo& mean, diff --git a/src/backends/neon/NeonLayerSupport.hpp b/src/backends/neon/NeonLayerSupport.hpp index 783e6a0c46..e916162f93 100644 --- a/src/backends/neon/NeonLayerSupport.hpp +++ b/src/backends/neon/NeonLayerSupport.hpp @@ -41,6 +41,12 @@ public: const ArgMinMaxDescriptor& descriptor, Optional reasonIfUnsupported = EmptyOptional()) const override; + bool IsBatchMatMulSupported(const TensorInfo& inputX, + const TensorInfo& inputY, + const TensorInfo& output, + const BatchMatMulDescriptor& descriptor, + Optional reasonIfUnsupported = EmptyOptional()) const; + bool IsBatchNormalizationSupported(const TensorInfo& input, const TensorInfo& output, const TensorInfo& mean, diff --git a/src/backends/neon/NeonWorkloadFactory.cpp b/src/backends/neon/NeonWorkloadFactory.cpp index ff9ef268ec..d5a7c684d3 100644 --- a/src/backends/neon/NeonWorkloadFactory.cpp +++ b/src/backends/neon/NeonWorkloadFactory.cpp @@ -152,6 +152,11 @@ std::unique_ptr NeonWorkloadFactory::CreateWorkload(LayerType type, auto argMinMaxQueueDescriptor = PolymorphicDowncast(&descriptor); return std::make_unique(*argMinMaxQueueDescriptor, info); } + case LayerType::BatchMatMul : + { + auto batchMatMulQueueDescriptor = PolymorphicDowncast(&descriptor); + return std::make_unique(*batchMatMulQueueDescriptor, info); + } case LayerType::BatchNormalization : { auto batchNormalizationQueueDescriptor diff --git a/src/backends/neon/backend.mk b/src/backends/neon/backend.mk index 7c0974c13f..b1c0103426 100644 --- a/src/backends/neon/backend.mk +++ b/src/backends/neon/backend.mk @@ -26,6 +26,7 @@ BACKEND_SOURCES := \ workloads/NeonActivationWorkload.cpp \ workloads/NeonAdditionWorkload.cpp \ workloads/NeonArgMinMaxWorkload.cpp \ + workloads/NeonBatchMatMulWorkload.cpp \ workloads/NeonBatchNormalizationWorkload.cpp \ workloads/NeonBatchToSpaceNdWorkload.cpp \ workloads/NeonCastWorkload.cpp \ diff --git a/src/backends/neon/test/NeonLayerTests.cpp b/src/backends/neon/test/NeonLayerTests.cpp index 91fb4d77a8..88e513e62f 100644 --- a/src/backends/neon/test/NeonLayerTests.cpp +++ b/src/backends/neon/test/NeonLayerTests.cpp @@ -50,6 +50,23 @@ ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNchwUint1, BatchToSpaceNdNchwTest1) ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNchwUint3, BatchToSpaceNdNchwTest3) +// Batch Mat Mul +ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul2DSimpleFloat32, BatchMatMul2DSimpleTest); + +ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul3DSimpleFloat32, BatchMatMul3DSimpleTest); + +ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMulNCHWSimpleFloat32, BatchMatMulNCHWSimpleTest); + +ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul3DBatchFloat32, BatchMatMul3DBatchTest); + +ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul3DBroadcastFloat32, BatchMatMul3DBroadcastTest); + +ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul3D2DBroadcastFloat32, BatchMatMul3D2DBroadcastTest); + +ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul2DTinyFloat32, BatchMatMul2DTinyTest); + +ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul2DTranspSimpleFloat32, 
BatchMatMul2DTranspSimpleTest); + // Convolution ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleConvolution1d, Convolution1dTest, true) diff --git a/src/backends/neon/workloads/CMakeLists.txt b/src/backends/neon/workloads/CMakeLists.txt index 2209bf47e2..dd09ecf015 100644 --- a/src/backends/neon/workloads/CMakeLists.txt +++ b/src/backends/neon/workloads/CMakeLists.txt @@ -12,6 +12,8 @@ list(APPEND armnnNeonBackendWorkloads_sources NeonAdditionWorkload.hpp NeonArgMinMaxWorkload.cpp NeonArgMinMaxWorkload.hpp + NeonBatchMatMulWorkload.cpp + NeonBatchMatMulWorkload.hpp NeonBatchNormalizationWorkload.cpp NeonBatchNormalizationWorkload.hpp NeonBatchToSpaceNdWorkload.cpp diff --git a/src/backends/neon/workloads/NeonBatchMatMulWorkload.cpp b/src/backends/neon/workloads/NeonBatchMatMulWorkload.cpp new file mode 100644 index 0000000000..3d8651f995 --- /dev/null +++ b/src/backends/neon/workloads/NeonBatchMatMulWorkload.cpp @@ -0,0 +1,190 @@ +// +// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include "NeonBatchMatMulWorkload.hpp" + +#include "NeonWorkloadUtils.hpp" + +#include + +#include + +#include + +#include + +#include + + +namespace armnn +{ +arm_compute::Status NeonBatchMatMulValidate(const TensorInfo& inputX, + const TensorInfo& inputY, + const TensorInfo& output, + const BatchMatMulDescriptor& descriptor) +{ + if (descriptor.m_AdjointX || descriptor.m_AdjointY ) + { + throw Exception("Support for adjoint not implemented."); + } + if (descriptor.m_DataLayoutX != armnn::DataLayout::NCHW || descriptor.m_DataLayoutY != armnn::DataLayout::NCHW ) + { + throw Exception("Only supported the MatMul in the last 2 dimensions"); + } + + const auto aclInputXInfo = armcomputetensorutils::BuildArmComputeTensorInfo(inputX, descriptor.m_DataLayoutX); + const auto aclInputYInfo = armcomputetensorutils::BuildArmComputeTensorInfo(inputY, descriptor.m_DataLayoutY); + const auto aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output); + + arm_compute::Status statusGEMM = arm_compute::Status(arm_compute::ErrorCode::OK); + arm_compute::Status statusPermuteX = arm_compute::Status(arm_compute::ErrorCode::OK); + arm_compute::Status statusPermuteY = arm_compute::Status(arm_compute::ErrorCode::OK); + + arm_compute::TensorInfo aclPermutedXInfo = arm_compute::TensorInfo(); + arm_compute::TensorInfo aclPermutedYInfo = arm_compute::TensorInfo(); + + if (descriptor.m_TransposeX == true) + { + auto permutationXVector = GeneratePermutationVectorOnLastTwoDimensions(inputX.GetNumDimensions()); + const auto aclPermutationXVector = armcomputetensorutils::BuildArmComputePermutationVector(permutationXVector); + const TensorInfo permutedXInfo = armnnUtils::Permuted(inputX, permutationXVector); + aclPermutedXInfo = armcomputetensorutils::BuildArmComputeTensorInfo(permutedXInfo); + + statusPermuteX = arm_compute::NEPermute::validate(&aclInputXInfo, + &aclPermutedXInfo, + aclPermutationXVector); + } + + if (descriptor.m_TransposeY == true) + { + auto permutationYVector = GeneratePermutationVectorOnLastTwoDimensions(inputY.GetNumDimensions()); + const auto aclPermutationYVector = armcomputetensorutils::BuildArmComputePermutationVector(permutationYVector); + const TensorInfo permutedYInfo = armnnUtils::Permuted(inputY, permutationYVector); + aclPermutedYInfo = armcomputetensorutils::BuildArmComputeTensorInfo(permutedYInfo); + + statusPermuteY = arm_compute::NEPermute::validate(&aclInputYInfo, + &aclPermutedYInfo, + aclPermutationYVector); + } + + const arm_compute::GEMMInfo& 
gemm_info = arm_compute::GEMMInfo(false, // is inputX reshaped + false, // is inputY reshaped + false); // is inputY reshaped only 1st run + + statusGEMM = arm_compute::NEGEMM::validate(descriptor.m_TransposeX ? &aclPermutedXInfo : &aclInputXInfo, + descriptor.m_TransposeY ? &aclPermutedYInfo : &aclInputYInfo, + nullptr, + &aclOutputInfo, + 1.0, + 0, + gemm_info); + + if (statusPermuteX.error_code() == arm_compute::ErrorCode::OK && + statusPermuteY.error_code() == arm_compute::ErrorCode::OK && + statusGEMM.error_code() == arm_compute::ErrorCode::OK) + { + return arm_compute::Status(arm_compute::ErrorCode::OK, + "All BatchMatMul layers validate status OK."); + } + else + { + return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, + "BatchMatMul layer validate status failed." + + statusGEMM.error_description() + + statusPermuteX.error_description() + + statusPermuteY.error_description()); + } + +} + +NeonBatchMatMulWorkload::NeonBatchMatMulWorkload( + const BatchMatMulQueueDescriptor& descriptor, const WorkloadInfo& info) + : NeonBaseWorkload(descriptor, info) +{ + if (descriptor.m_Parameters.m_AdjointX || descriptor.m_Parameters.m_AdjointY ) + { + throw Exception("Support for adjoint not implemented."); + } + if (descriptor.m_Parameters.m_DataLayoutX != armnn::DataLayout::NCHW || + descriptor.m_Parameters.m_DataLayoutY != armnn::DataLayout::NCHW ) + { + throw Exception("Only supported the MatMul in the last 2 dimensions"); + } + + // Report Profiling Details + ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonBatchMatMulWorkload_Construct", + descriptor.m_Parameters, + info, + this->GetGuid()); + + m_Data.ValidateInputsOutputs("NeonBatchMatMulWorkload", 2, 1); + + arm_compute::ITensor& inputX = PolymorphicDowncast(m_Data.m_Inputs[0])->GetTensor(); + arm_compute::ITensor& inputY = PolymorphicDowncast(m_Data.m_Inputs[1])->GetTensor(); + auto outputHandle = PolymorphicDowncast(m_Data.m_Outputs[0]); + arm_compute::ITensor& output = outputHandle->GetTensor(); + + arm_compute::DataLayout aclDataLayoutX = ConvertDataLayout(m_Data.m_Parameters.m_DataLayoutX); + arm_compute::DataLayout aclDataLayoutY = ConvertDataLayout(m_Data.m_Parameters.m_DataLayoutY); + + inputX.info()->set_data_layout(aclDataLayoutX); + inputY.info()->set_data_layout(aclDataLayoutY); + + if (descriptor.m_Parameters.m_TransposeX == true) + { + armnn::PermutationVector permutationXVector + = GeneratePermutationVectorOnLastTwoDimensions(info.m_InputTensorInfos[0].GetNumDimensions()); + const TensorInfo permutedXInfo = armnnUtils::Permuted(info.m_InputTensorInfos[0], permutationXVector); + const auto aclPermutationXVector = armcomputetensorutils::BuildArmComputePermutationVector(permutationXVector); + + auto permuteLayerX = std::make_unique(); + BuildArmComputeTensor(m_PermutedTensorX, permutedXInfo); + InitialiseArmComputeTensorEmpty(m_PermutedTensorX); + permuteLayerX->configure(&inputX, &m_PermutedTensorX, aclPermutationXVector); + m_PermuteLayerX.reset(permuteLayerX.release()); + } + + if (descriptor.m_Parameters.m_TransposeY == true) + { + armnn::PermutationVector permutationYVector + = GeneratePermutationVectorOnLastTwoDimensions(info.m_InputTensorInfos[1].GetNumDimensions()); + const TensorInfo permutedYInfo = armnnUtils::Permuted(info.m_InputTensorInfos[1], permutationYVector); + const auto aclPermutationYVector = armcomputetensorutils::BuildArmComputePermutationVector(permutationYVector); + + auto permuteLayerY = std::make_unique(); + BuildArmComputeTensor(m_PermutedTensorY, permutedYInfo); + 
InitialiseArmComputeTensorEmpty(m_PermutedTensorY); + permuteLayerY->configure(&inputY, &m_PermutedTensorY, aclPermutationYVector); + m_PermuteLayerY.reset(permuteLayerY.release()); + } + + const arm_compute::GEMMInfo& gemm_info = arm_compute::GEMMInfo(false, // is inputX reshaped + false, // is inputY reshaped + false); // is inputY reshaped only 1st run + auto gemmLayer = std::make_unique(); + gemmLayer->configure(descriptor.m_Parameters.m_TransposeX ? &m_PermutedTensorX : &inputX, + descriptor.m_Parameters.m_TransposeY ? &m_PermutedTensorY : &inputY, + nullptr, + &output, + 1.0, + 0, + gemm_info); + m_GEMMLayer.reset(gemmLayer.release()); +} + +void NeonBatchMatMulWorkload::Execute() const +{ + ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonBatchMatMulWorkload_Execute", this->GetGuid()); + if (m_PermuteLayerX) + { + m_PermuteLayerX->run(); + } + if (m_PermuteLayerY) + { + m_PermuteLayerY->run(); + } + m_GEMMLayer->run(); +} +} //namespace armnn diff --git a/src/backends/neon/workloads/NeonBatchMatMulWorkload.hpp b/src/backends/neon/workloads/NeonBatchMatMulWorkload.hpp new file mode 100644 index 0000000000..cb004d2478 --- /dev/null +++ b/src/backends/neon/workloads/NeonBatchMatMulWorkload.hpp @@ -0,0 +1,41 @@ +// +// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#pragma once + +#include "NeonBaseWorkload.hpp" + +#include +#include + +#include + +namespace armnn +{ + arm_compute::Status NeonBatchMatMulValidate(const TensorInfo& inputX, + const TensorInfo& inputY, + const TensorInfo& output, + const BatchMatMulDescriptor& descriptor); + + class NeonBatchMatMulWorkload : public NeonBaseWorkload + { + public: + NeonBatchMatMulWorkload(const BatchMatMulQueueDescriptor& descriptor, + const WorkloadInfo& info); + virtual void Execute() const override; + + private: + // ACL layers required to fully form a Batch Mat Mul layer. + std::unique_ptr m_GEMMLayer; + std::unique_ptr m_PermuteLayerX; + std::unique_ptr m_PermuteLayerY; + + // Additional ACL arm_compute::Tensors. + // Required to perform permutations. + arm_compute::Tensor m_PermutedTensorX; + arm_compute::Tensor m_PermutedTensorY; + + }; +} //namespace armnn diff --git a/src/backends/neon/workloads/NeonWorkloads.hpp b/src/backends/neon/workloads/NeonWorkloads.hpp index 8f83674950..c9c5421804 100644 --- a/src/backends/neon/workloads/NeonWorkloads.hpp +++ b/src/backends/neon/workloads/NeonWorkloads.hpp @@ -8,6 +8,7 @@ #include "NeonActivationWorkload.hpp" #include "NeonAdditionWorkload.hpp" #include "NeonArgMinMaxWorkload.hpp" +#include "NeonBatchMatMulWorkload.hpp" #include "NeonBatchNormalizationWorkload.hpp" #include "NeonBatchToSpaceNdWorkload.hpp" #include "NeonCastWorkload.hpp" -- cgit v1.2.1
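Note (not part of the patch): the new `GeneratePermutationVectorOnLastTwoDimensions` helper added in WorkloadUtils is what lets the Neon workload honour `m_TransposeX` / `m_TransposeY` — it swaps only the two right-most dimensions, and the resulting vector is fed to `NEPermute` before the `NEGEMM` call. The standalone sketch below mirrors that rank-to-vector mapping for ranks 2–4; the function and variable names here are illustrative only and do not exist in the Arm NN codebase.

```cpp
// Standalone illustration of the mapping used by
// GeneratePermutationVectorOnLastTwoDimensions: leading (batch) dimensions
// keep their positions, only the last two axes are swapped.
#include <cstdio>
#include <stdexcept>
#include <vector>

std::vector<unsigned int> LastTwoDimsSwapped(unsigned int rank)
{
    switch (rank)
    {
        case 2: return {1u, 0u};          // plain matrix: full transpose
        case 3: return {0u, 2u, 1u};      // keep batch dim 0, swap dims 1 and 2
        case 4: return {0u, 1u, 3u, 2u};  // keep dims 0 and 1, swap dims 2 and 3
        default: throw std::invalid_argument("Invalid number of dimensions.");
    }
}

int main()
{
    for (unsigned int rank = 2; rank <= 4; ++rank)
    {
        std::printf("rank %u:", rank);
        for (unsigned int axis : LastTwoDimsSwapped(rank))
        {
            std::printf(" %u", axis);
        }
        std::printf("\n");
    }
    return 0;
}
```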