From 94916a5c06065bca0b232106bd4ae68f9986b7b0 Mon Sep 17 00:00:00 2001
From: Teresa Charlin
Date: Wed, 19 Oct 2022 08:48:07 +0100
Subject: IVGCVSW-6493 Add GpuAcc Batch MatMul workload Fp32

* GpuAcc only supports up to 3D, so no 4D tests have been added

Signed-off-by: Teresa Charlin
Change-Id: Ie926cd45c350be624cbdc6cb27c89d2d3f60884b
---
 delegate/src/test/BatchMatMulTest.cpp      |  17 +-
 docs/02_operator_list.dox                  |   9 +-
 src/backends/cl/ClLayerSupport.cpp         |  21 +++
 src/backends/cl/ClLayerSupport.hpp         |   6 +
 src/backends/cl/ClWorkloadFactory.cpp      |   5 +
 src/backends/cl/backend.mk                 |   1 +
 src/backends/cl/test/ClLayerTests.cpp      |  23 +++
 src/backends/cl/workloads/CMakeLists.txt   |   2 +
 .../cl/workloads/ClBatchMatMulWorkload.cpp | 203 +++++++++++++++++++++
 .../cl/workloads/ClBatchMatMulWorkload.hpp |  41 +++++
 src/backends/cl/workloads/ClWorkloads.hpp  |   1 +
 11 files changed, 324 insertions(+), 5 deletions(-)
 create mode 100644 src/backends/cl/workloads/ClBatchMatMulWorkload.cpp
 create mode 100644 src/backends/cl/workloads/ClBatchMatMulWorkload.hpp

diff --git a/delegate/src/test/BatchMatMulTest.cpp b/delegate/src/test/BatchMatMulTest.cpp
index e5cb976c45..d13d8dcf43 100644
--- a/delegate/src/test/BatchMatMulTest.cpp
+++ b/delegate/src/test/BatchMatMulTest.cpp
@@ -268,7 +268,7 @@ namespace armnnDelegate
 {
     // Set input data
     std::vector LHSInputShape { 2,2,2 };
-    std::vector RHSInputShape { 1,2,2 };
+    std::vector RHSInputShape { 2,2 };
     std::vector outputShape { 2,2,2 };

     std::vector LHSInputValues = { 1, 2,
@@ -670,4 +670,19 @@ namespace armnnDelegate
         BatchMatMul2DFp32SimpleAdjointTest(backends);
     }
 }
+TEST_SUITE("BATCH_MATMUL_GpuAccTests")
+{
+    TEST_CASE("BATCH_MATMUL_Fp32_GpuAccTests")
+    {
+        std::vector backends = {armnn::Compute::GpuAcc};
+        BatchMatMul2DFp32SimpleTest       (backends);
+        BatchMatMul3DFp32SimpleTest       (backends);
+        BatchMatMul3DFp32BatchTest        (backends);
+        BatchMatMul3DFp32BroadcastTest    (backends);
+        BatchMatMul3D2DFp32BroadcastTest  (backends);
+        BatchMatMul2DFp32TinyTest         (backends);
+        BatchMatMulNonSquareFp32Test      (backends);
+        BatchMatMul2DFp32SimpleAdjointTest(backends);
+    }
+}
 }
diff --git a/docs/02_operator_list.dox b/docs/02_operator_list.dox
index d9a3d2c83b..007d4f5e35 100644
--- a/docs/02_operator_list.dox
+++ b/docs/02_operator_list.dox
@@ -304,12 +304,13 @@ where N = batches, C = channels, H = height, W = width
     GpuAcc
     <td>
        <ul>
-        <li>N/A
+        <li>All
        </ul>
     <td>
-       <ul>
-        <li>N/A
-       </ul>
+      <table>
+       <tr><th>
+        <tr><td>FLOAT32
+      </table>
BatchNormalizationLayer Layer to perform batch normalization. diff --git a/src/backends/cl/ClLayerSupport.cpp b/src/backends/cl/ClLayerSupport.cpp index a61a5bb640..cb2d756037 100644 --- a/src/backends/cl/ClLayerSupport.cpp +++ b/src/backends/cl/ClLayerSupport.cpp @@ -22,6 +22,7 @@ #include "workloads/ClAdditionWorkload.hpp" #include "workloads/ClActivationWorkload.hpp" #include "workloads/ClArgMinMaxWorkload.hpp" +#include "workloads/ClBatchMatMulWorkload.hpp" #include "workloads/ClBatchNormalizationFloatWorkload.hpp" #include "workloads/ClBatchToSpaceNdWorkload.hpp" #include "workloads/ClCastWorkload.hpp" @@ -201,6 +202,12 @@ bool ClLayerSupport::IsLayerSupported(const LayerType& type, infos[1], *(PolymorphicDowncast(&descriptor)), reasonIfUnsupported); + case LayerType::BatchMatMul: + return IsBatchMatMulSupported(infos[0], + infos[1], + infos[2], + *(PolymorphicDowncast(&descriptor)), + reasonIfUnsupported); case LayerType::BatchNormalization: return IsBatchNormalizationSupported(infos[0], infos[1], @@ -640,6 +647,20 @@ bool ClLayerSupport::IsArgMinMaxSupported(const TensorInfo& input, descriptor); } +bool ClLayerSupport::IsBatchMatMulSupported(const TensorInfo& inputX, + const TensorInfo& inputY, + const TensorInfo& output, + const BatchMatMulDescriptor& descriptor, + Optional reasonIfUnsupported) const +{ + FORWARD_WORKLOAD_VALIDATE_FUNC(ClBatchMatMulValidate, + reasonIfUnsupported, + inputX, + inputY, + output, + descriptor); +} + bool ClLayerSupport::IsBatchNormalizationSupported(const TensorInfo& input, const TensorInfo& output, const TensorInfo& mean, diff --git a/src/backends/cl/ClLayerSupport.hpp b/src/backends/cl/ClLayerSupport.hpp index 27311f74aa..2d784e3df8 100644 --- a/src/backends/cl/ClLayerSupport.hpp +++ b/src/backends/cl/ClLayerSupport.hpp @@ -40,6 +40,12 @@ public: const ArgMinMaxDescriptor& descriptor, Optional reasonIfUnsupported = EmptyOptional()) const override; + bool IsBatchMatMulSupported(const TensorInfo& inputX, + const TensorInfo& inputY, + const TensorInfo& output, + const BatchMatMulDescriptor& descriptor, + Optional reasonIfUnsupported = EmptyOptional()) const; + bool IsBatchNormalizationSupported(const TensorInfo& input, const TensorInfo& output, const TensorInfo& mean, diff --git a/src/backends/cl/ClWorkloadFactory.cpp b/src/backends/cl/ClWorkloadFactory.cpp index d0079abd38..6bf510a2ef 100644 --- a/src/backends/cl/ClWorkloadFactory.cpp +++ b/src/backends/cl/ClWorkloadFactory.cpp @@ -265,6 +265,11 @@ std::unique_ptr ClWorkloadFactory::CreateWorkload(LayerType type, auto argMinMaxQueueDescriptor = PolymorphicDowncast(&descriptor); return MakeWorkload(*argMinMaxQueueDescriptor, info, m_CLCompileContext); } + case LayerType::BatchMatMul : + { + auto batchMatMulQueueDescriptor = PolymorphicDowncast(&descriptor); + return std::make_unique(*batchMatMulQueueDescriptor, info, m_CLCompileContext); + } case LayerType::BatchNormalization : { auto batchNormalizationQueueDescriptor diff --git a/src/backends/cl/backend.mk b/src/backends/cl/backend.mk index 6fda16db05..1f97ae7cc8 100644 --- a/src/backends/cl/backend.mk +++ b/src/backends/cl/backend.mk @@ -30,6 +30,7 @@ BACKEND_SOURCES := \ workloads/ClActivationWorkload.cpp \ workloads/ClAdditionWorkload.cpp \ workloads/ClArgMinMaxWorkload.cpp \ + workloads/ClBatchMatMulWorkload.cpp \ workloads/ClBatchNormalizationFloatWorkload.cpp \ workloads/ClBatchToSpaceNdWorkload.cpp \ workloads/ClCastWorkload.cpp \ diff --git a/src/backends/cl/test/ClLayerTests.cpp b/src/backends/cl/test/ClLayerTests.cpp index 
855697c9be..4ba2a9ec3b 100644 --- a/src/backends/cl/test/ClLayerTests.cpp +++ b/src/backends/cl/test/ClLayerTests.cpp @@ -73,6 +73,29 @@ ARMNN_AUTO_TEST_FIXTURE_WITH_THF(Tanh, ClContextControlFixture, TanhTest) // Elu Activation ARMNN_AUTO_TEST_FIXTURE_WITH_THF(Elu, ClContextControlFixture, EluTest) +// Batch Mat Mul +ARMNN_AUTO_TEST_FIXTURE_WITH_THF(BatchMatMul2DSimpleFloat32, + ClContextControlFixture, + BatchMatMul2DSimpleTest); +ARMNN_AUTO_TEST_FIXTURE_WITH_THF(BatchMatMul3DSimpleFloat32, + ClContextControlFixture, + BatchMatMul3DSimpleTest); +ARMNN_AUTO_TEST_FIXTURE_WITH_THF(BatchMatMul3DBatchFloat32, + ClContextControlFixture, + BatchMatMul3DBatchTest); +ARMNN_AUTO_TEST_FIXTURE_WITH_THF(BatchMatMul3DBroadcastFloat32, + ClContextControlFixture, + BatchMatMul3DBroadcastTest); +ARMNN_AUTO_TEST_FIXTURE_WITH_THF(BatchMatMul3D2DBroadcastFloat32, + ClContextControlFixture, + BatchMatMul3D2DBroadcastTest); +ARMNN_AUTO_TEST_FIXTURE_WITH_THF(BatchMatMul2DTinyFloat32, + ClContextControlFixture, + BatchMatMul2DTinyTest); +ARMNN_AUTO_TEST_FIXTURE_WITH_THF(BatchMatMul2DTranspSimpleFloat32, + ClContextControlFixture, + BatchMatMul2DTranspSimpleTest); + // Batch To Space ARMNN_AUTO_TEST_FIXTURE_WITH_THF(BatchToSpaceNdNhwcFloat321, ClContextControlFixture, diff --git a/src/backends/cl/workloads/CMakeLists.txt b/src/backends/cl/workloads/CMakeLists.txt index aef7fc7ad2..8616dec078 100644 --- a/src/backends/cl/workloads/CMakeLists.txt +++ b/src/backends/cl/workloads/CMakeLists.txt @@ -12,6 +12,8 @@ list(APPEND armnnClBackendWorkloads_sources ClAdditionWorkload.hpp ClArgMinMaxWorkload.cpp ClArgMinMaxWorkload.hpp + ClBatchMatMulWorkload.cpp + ClBatchMatMulWorkload.hpp ClBatchNormalizationFloatWorkload.cpp ClBatchNormalizationFloatWorkload.hpp ClBatchToSpaceNdWorkload.cpp diff --git a/src/backends/cl/workloads/ClBatchMatMulWorkload.cpp b/src/backends/cl/workloads/ClBatchMatMulWorkload.cpp new file mode 100644 index 0000000000..4acdef5e5c --- /dev/null +++ b/src/backends/cl/workloads/ClBatchMatMulWorkload.cpp @@ -0,0 +1,203 @@ +// +// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. 
+// SPDX-License-Identifier: MIT +// + +#include "ClBatchMatMulWorkload.hpp" + +#include "ClWorkloadUtils.hpp" + +#include +#include + +#include + +#include + +#include + +#include + +#include +#include + + +namespace armnn +{ +arm_compute::Status ClBatchMatMulValidate(const TensorInfo& inputX, + const TensorInfo& inputY, + const TensorInfo& output, + const BatchMatMulDescriptor& descriptor) +{ + if (descriptor.m_AdjointX || descriptor.m_AdjointY ) + { + throw Exception("Support for adjoint not implemented."); + } + if (descriptor.m_DataLayoutX != armnn::DataLayout::NCHW || descriptor.m_DataLayoutY != armnn::DataLayout::NCHW ) + { + throw Exception("Only supported the MatMul in the last 2 dimensions"); + } + + arm_compute::Status statusGEMM = arm_compute::Status(arm_compute::ErrorCode::OK); + arm_compute::Status statusPermuteX = arm_compute::Status(arm_compute::ErrorCode::OK); + arm_compute::Status statusPermuteY = arm_compute::Status(arm_compute::ErrorCode::OK); + + const auto aclInputXInfo = armcomputetensorutils::BuildArmComputeTensorInfo(inputX, descriptor.m_DataLayoutX); + const auto aclInputYInfo = armcomputetensorutils::BuildArmComputeTensorInfo(inputY, descriptor.m_DataLayoutY); + const auto aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output); + + arm_compute::TensorInfo aclPermutedXInfo = arm_compute::TensorInfo(); + arm_compute::TensorInfo aclPermutedYInfo = arm_compute::TensorInfo(); + + if (descriptor.m_TransposeX == true) + { + auto permutationXVector = GeneratePermutationVectorOnLastTwoDimensions(inputX.GetNumDimensions()); + const auto aclPermutationXVector = armcomputetensorutils::BuildArmComputePermutationVector(permutationXVector); + const TensorInfo permutedXInfo = armnnUtils::Permuted(inputX, permutationXVector); + aclPermutedXInfo = armcomputetensorutils::BuildArmComputeTensorInfo(permutedXInfo); + + statusPermuteX = arm_compute::CLPermute::validate(&aclInputXInfo, + &aclPermutedXInfo, + aclPermutationXVector); + } + + if ( descriptor.m_TransposeY == true) + { + auto permutationYVector = GeneratePermutationVectorOnLastTwoDimensions(inputY.GetNumDimensions()); + const auto aclPermutationYVector = armcomputetensorutils::BuildArmComputePermutationVector(permutationYVector); + const TensorInfo permutedYInfo = armnnUtils::Permuted(inputY, permutationYVector); + aclPermutedYInfo = armcomputetensorutils::BuildArmComputeTensorInfo(permutedYInfo); + + statusPermuteY = arm_compute::CLPermute::validate(&aclInputYInfo, + &aclPermutedYInfo, + aclPermutationYVector); + + } + + const arm_compute::GEMMInfo& gemm_info = arm_compute::GEMMInfo(false, // is inputX reshaped + false, // is inputY reshaped + false); // is inputY reshaped only 1st run + + + statusGEMM = arm_compute::CLGEMM::validate(descriptor.m_TransposeX ? &aclPermutedXInfo : &aclInputXInfo, + descriptor.m_TransposeY ? &aclPermutedYInfo : &aclInputYInfo, + nullptr, + &aclOutputInfo, + 1.0, + 0, + gemm_info); + + if (statusPermuteX.error_code() == arm_compute::ErrorCode::OK && + statusPermuteY.error_code() == arm_compute::ErrorCode::OK && + statusGEMM.error_code() == arm_compute::ErrorCode::OK) + { + return arm_compute::Status(arm_compute::ErrorCode::OK, + "All Batch Mat Mul layers validate status OK."); + } + else + { + return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, + "BatchMatMul layer validate status failed." 
+                                       + statusGEMM.error_description()
+                                       + statusPermuteX.error_description()
+                                       + statusPermuteY.error_description());
+    }
+
+}
+
+ClBatchMatMulWorkload::ClBatchMatMulWorkload(const BatchMatMulQueueDescriptor& descriptor,
+                                             const WorkloadInfo& info,
+                                             const arm_compute::CLCompileContext& clCompileContext)
+    : ClBaseWorkload(descriptor, info)
+{
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("ClBatchMatMulWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
+    if (descriptor.m_Parameters.m_AdjointX || descriptor.m_Parameters.m_AdjointY )
+    {
+        throw Exception("Support for adjoint not implemented.");
+    }
+    if (descriptor.m_Parameters.m_DataLayoutX != armnn::DataLayout::NCHW ||
+        descriptor.m_Parameters.m_DataLayoutY != armnn::DataLayout::NCHW )
+    {
+        throw Exception("Only supported the MatMul in the last 2 dimensions");
+    }
+
+    m_Data.ValidateInputsOutputs("ClBatchMatMulWorkload", 2, 1);
+
+    const arm_compute::ICLTensor& inputX = PolymorphicDowncast(m_Data.m_Inputs[0])->GetTensor();
+    const arm_compute::ICLTensor& inputY = PolymorphicDowncast(m_Data.m_Inputs[1])->GetTensor();
+    arm_compute::ICLTensor& output = PolymorphicDowncast(m_Data.m_Outputs[0])->GetTensor();
+
+    inputX.info()->set_data_layout(armcomputetensorutils::ConvertDataLayout(m_Data.m_Parameters.m_DataLayoutX));
+    inputY.info()->set_data_layout(armcomputetensorutils::ConvertDataLayout(m_Data.m_Parameters.m_DataLayoutY));
+
+    arm_compute::TensorInfo aclPermutedXInfo = arm_compute::TensorInfo();
+    arm_compute::TensorInfo aclPermutedYInfo = arm_compute::TensorInfo();
+
+    if (descriptor.m_Parameters.m_TransposeX == true)
+    {
+        armnn::PermutationVector permutationXVector
+            = GeneratePermutationVectorOnLastTwoDimensions(info.m_InputTensorInfos[0].GetNumDimensions());
+        const TensorInfo permutedXInfo = armnnUtils::Permuted(info.m_InputTensorInfos[0], permutationXVector);
+        const auto aclPermutationXVector = armcomputetensorutils::BuildArmComputePermutationVector(permutationXVector);
+        armcomputetensorutils::BuildArmComputeTensor(m_PermutedTensorX, permutedXInfo);
+        armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_PermutedTensorX);
+
+        auto permuteLayerX = std::make_unique();
+        permuteLayerX->configure(clCompileContext,
+                                 &inputX,
+                                 &m_PermutedTensorX,
+                                 aclPermutationXVector);
+        m_PermuteLayerX.reset(permuteLayerX.release());
+    }
+
+    if (descriptor.m_Parameters.m_TransposeY == true)
+    {
+        armnn::PermutationVector permutationYVector
+            = GeneratePermutationVectorOnLastTwoDimensions(info.m_InputTensorInfos[1].GetNumDimensions());
+        const TensorInfo permutedYInfo = armnnUtils::Permuted(info.m_InputTensorInfos[1], permutationYVector);
+        const auto aclPermutationYVector = armcomputetensorutils::BuildArmComputePermutationVector(permutationYVector);
+        armcomputetensorutils::BuildArmComputeTensor(m_PermutedTensorY, permutedYInfo);
+        armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_PermutedTensorY);
+
+        std::unique_ptr permuteLayerY(new arm_compute::CLPermute());
+        permuteLayerY->configure(clCompileContext,
+                                 &inputY,
+                                 &m_PermutedTensorY,
+                                 aclPermutationYVector);
+        m_PermuteLayerY.reset(permuteLayerY.release());
+    }
+
+    const arm_compute::GEMMInfo& gemm_info = arm_compute::GEMMInfo(false,  // is inputX reshaped
+                                                                   false,  // is inputY reshaped
+                                                                   false); // is inputY reshaped only 1st run
+    auto gemmLayer = std::make_unique();
+    gemmLayer->configure(clCompileContext,
+                         descriptor.m_Parameters.m_TransposeX ? &m_PermutedTensorX : &inputX,
+                         descriptor.m_Parameters.m_TransposeY ?
&m_PermutedTensorY : &inputY, + nullptr, + &output, + 1.0, + 0, + gemm_info); + m_GEMMLayer.reset(gemmLayer.release()); +} + +void ClBatchMatMulWorkload::Execute() const +{ + ARMNN_SCOPED_PROFILING_EVENT_CL_GUID("ClBatchMatMulWorkload_Execute", this->GetGuid()); + if (m_PermuteLayerX) + { + m_PermuteLayerX->run(); + } + if (m_PermuteLayerY) + { + m_PermuteLayerY->run(); + } + m_GEMMLayer->run(); +} +} //namespace armnn diff --git a/src/backends/cl/workloads/ClBatchMatMulWorkload.hpp b/src/backends/cl/workloads/ClBatchMatMulWorkload.hpp new file mode 100644 index 0000000000..5277efc947 --- /dev/null +++ b/src/backends/cl/workloads/ClBatchMatMulWorkload.hpp @@ -0,0 +1,41 @@ +// +// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#pragma once + +#include "ClBaseWorkload.hpp" + +#include +#include +#include + +namespace armnn +{ + arm_compute::Status ClBatchMatMulValidate(const TensorInfo& inputX, + const TensorInfo& inputY, + const TensorInfo& output, + const BatchMatMulDescriptor& descriptor); + + class ClBatchMatMulWorkload : public ClBaseWorkload + { + public: + ClBatchMatMulWorkload(const BatchMatMulQueueDescriptor& descriptor, + const WorkloadInfo& info, + const arm_compute::CLCompileContext& clCompileContext); + virtual void Execute() const override; + + private: + // ACL layers required to fully form a Batch Mat Mul layer. + std::unique_ptr m_GEMMLayer; + std::unique_ptr m_PermuteLayerX; + std::unique_ptr m_PermuteLayerY; + + // Additional CL arm_compute::Tensors. + // Required to perform permutations. + arm_compute::CLTensor m_PermutedTensorX; + arm_compute::CLTensor m_PermutedTensorY; + + }; +} //namespace armnn diff --git a/src/backends/cl/workloads/ClWorkloads.hpp b/src/backends/cl/workloads/ClWorkloads.hpp index c3a79b7583..44f3798d7d 100644 --- a/src/backends/cl/workloads/ClWorkloads.hpp +++ b/src/backends/cl/workloads/ClWorkloads.hpp @@ -10,6 +10,7 @@ #include "ClArgMinMaxWorkload.hpp" #include "ClComparisonWorkload.hpp" #include "ClConstantWorkload.hpp" +#include "ClBatchMatMulWorkload.hpp" #include "ClBatchNormalizationFloatWorkload.hpp" #include "ClBatchToSpaceNdWorkload.hpp" #include "ClCastWorkload.hpp" -- cgit v1.2.1
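The new workload composes BatchMatMul from two existing Arm Compute Library primitives: an optional CLPermute on each input when m_TransposeX or m_TransposeY is set, followed by a single CLGEMM call with alpha = 1, beta = 0 and no accumulation tensor. The scalar sketch below illustrates that decomposition on [batch, rows, cols] Fp32 tensors. It is a minimal reference illustration only: the Tensor3d type and function names are hypothetical, it is not the Arm Compute Library or Arm NN code path, and broadcast cases exercised by the tests (for example 3D x 2D) are left to CLGEMM in the real workload and are not reproduced here.

// Scalar reference sketch of the permute-then-GEMM decomposition.
// Tensor3d and these helpers are illustrative only.
#include <cstddef>
#include <vector>

struct Tensor3d
{
    std::size_t batch;
    std::size_t rows;
    std::size_t cols;
    std::vector<float> data; // row-major, one rows x cols matrix per batch

    float& At(std::size_t b, std::size_t r, std::size_t c)
    {
        return data[(b * rows + r) * cols + c];
    }
    float At(std::size_t b, std::size_t r, std::size_t c) const
    {
        return data[(b * rows + r) * cols + c];
    }
};

// Mirrors the CLPermute step: swap the last two dimensions of every batch.
Tensor3d TransposeLastTwoDims(const Tensor3d& in)
{
    Tensor3d out{in.batch, in.cols, in.rows, std::vector<float>(in.data.size())};
    for (std::size_t b = 0; b < in.batch; ++b)
    {
        for (std::size_t r = 0; r < in.rows; ++r)
        {
            for (std::size_t c = 0; c < in.cols; ++c)
            {
                out.At(b, c, r) = in.At(b, r, c);
            }
        }
    }
    return out;
}

// Mirrors the CLGEMM step, applied once per batch with alpha = 1 and beta = 0.
// Requires y.rows == x.cols after any transposes; no broadcasting is handled here.
Tensor3d BatchMatMulReference(Tensor3d x, Tensor3d y, bool transposeX, bool transposeY)
{
    if (transposeX) { x = TransposeLastTwoDims(x); }
    if (transposeY) { y = TransposeLastTwoDims(y); }

    Tensor3d out{x.batch, x.rows, y.cols,
                 std::vector<float>(x.batch * x.rows * y.cols, 0.0f)};
    for (std::size_t b = 0; b < x.batch; ++b)
    {
        for (std::size_t r = 0; r < x.rows; ++r)
        {
            for (std::size_t c = 0; c < y.cols; ++c)
            {
                float acc = 0.0f;
                for (std::size_t k = 0; k < x.cols; ++k)
                {
                    acc += x.At(b, r, k) * y.At(b, k, c);
                }
                out.At(b, r, c) = acc;
            }
        }
    }
    return out;
}

As in the workload itself, adjoint inputs are not handled and the multiplication is assumed to act on the trailing two dimensions, which is the only configuration ClBatchMatMulValidate accepts.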