From 1fe6c8170ae2fe90b53fb71b7570aec9dfe75c45 Mon Sep 17 00:00:00 2001 From: Teresa Charlin Date: Tue, 1 Nov 2022 15:59:50 +0000 Subject: IVGCVSW-7307 Add CpuAcc Batch MatMul Workload * Call dedicated MatMul kernel in ACL * Add int8 tests * Add int8 to documentation * Force tensors to be dynamic (nonConst) as per request of ACL Signed-off-by: Teresa Charlin Change-Id: I992ae9aae1174214607bf29305f21cdeaf3fdc1b --- .../neon/workloads/NeonBatchMatMulWorkload.cpp | 183 ++++++--------------- .../neon/workloads/NeonBatchMatMulWorkload.hpp | 32 ++-- 2 files changed, 62 insertions(+), 153 deletions(-) (limited to 'src/backends/neon/workloads') diff --git a/src/backends/neon/workloads/NeonBatchMatMulWorkload.cpp b/src/backends/neon/workloads/NeonBatchMatMulWorkload.cpp index 3d8651f995..9b22033bd1 100644 --- a/src/backends/neon/workloads/NeonBatchMatMulWorkload.cpp +++ b/src/backends/neon/workloads/NeonBatchMatMulWorkload.cpp @@ -1,5 +1,5 @@ // -// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // @@ -8,22 +8,18 @@ #include "NeonWorkloadUtils.hpp" #include - -#include +#include #include -#include - -#include - - namespace armnn { -arm_compute::Status NeonBatchMatMulValidate(const TensorInfo& inputX, - const TensorInfo& inputY, - const TensorInfo& output, - const BatchMatMulDescriptor& descriptor) +arm_compute::Status NeonBatchMatMulValidate(const TensorInfo& inputInfoX, + const TensorInfo& inputInfoY, + const TensorInfo& outputInfo, + const BatchMatMulDescriptor& descriptor, + const bool isFastMathEnabled, + const ActivationDescriptor* activationDescriptor) { if (descriptor.m_AdjointX || descriptor.m_AdjointY ) { @@ -34,157 +30,78 @@ arm_compute::Status NeonBatchMatMulValidate(const TensorInfo& inputX, throw Exception("Only supported the MatMul in the last 2 dimensions"); } - const auto aclInputXInfo = armcomputetensorutils::BuildArmComputeTensorInfo(inputX, descriptor.m_DataLayoutX); - const auto aclInputYInfo = armcomputetensorutils::BuildArmComputeTensorInfo(inputY, descriptor.m_DataLayoutY); - const auto aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output); + arm_compute::TensorInfo aclInputInfoX = armcomputetensorutils::BuildArmComputeTensorInfo(inputInfoX); + arm_compute::TensorInfo aclInputInfoY = armcomputetensorutils::BuildArmComputeTensorInfo(inputInfoY); + arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(outputInfo); - arm_compute::Status statusGEMM = arm_compute::Status(arm_compute::ErrorCode::OK); - arm_compute::Status statusPermuteX = arm_compute::Status(arm_compute::ErrorCode::OK); - arm_compute::Status statusPermuteY = arm_compute::Status(arm_compute::ErrorCode::OK); + // GeMM dispatches kernel handles dynamic inputs differently to static so this flag needs to be set + aclInputInfoX.set_are_values_constant(false); + aclInputInfoY.set_are_values_constant(false); - arm_compute::TensorInfo aclPermutedXInfo = arm_compute::TensorInfo(); - arm_compute::TensorInfo aclPermutedYInfo = arm_compute::TensorInfo(); + const arm_compute::ActivationLayerInfo activationInfo = ConvertActivationDescriptorToAclActivationLayerInfo( + activationDescriptor); - if (descriptor.m_TransposeX == true) - { - auto permutationXVector = GeneratePermutationVectorOnLastTwoDimensions(inputX.GetNumDimensions()); - const auto aclPermutationXVector = armcomputetensorutils::BuildArmComputePermutationVector(permutationXVector); - const TensorInfo permutedXInfo = armnnUtils::Permuted(inputX, permutationXVector); - aclPermutedXInfo = armcomputetensorutils::BuildArmComputeTensorInfo(permutedXInfo); - - statusPermuteX = arm_compute::NEPermute::validate(&aclInputXInfo, - &aclPermutedXInfo, - aclPermutationXVector); - } + arm_compute::MatMulInfo matMulInfo; + matMulInfo.adj_lhs(descriptor.m_TransposeX); + matMulInfo.adj_rhs(descriptor.m_TransposeY); + matMulInfo.fused_activation(activationInfo); - if (descriptor.m_TransposeY == true) - { - auto permutationYVector = GeneratePermutationVectorOnLastTwoDimensions(inputY.GetNumDimensions()); - const auto aclPermutationYVector = armcomputetensorutils::BuildArmComputePermutationVector(permutationYVector); - const TensorInfo permutedYInfo = armnnUtils::Permuted(inputY, permutationYVector); - aclPermutedYInfo = armcomputetensorutils::BuildArmComputeTensorInfo(permutedYInfo); - - statusPermuteY = arm_compute::NEPermute::validate(&aclInputYInfo, - &aclPermutedYInfo, - aclPermutationYVector); - } - - const arm_compute::GEMMInfo& gemm_info = arm_compute::GEMMInfo(false, // is inputX reshaped - false, // is inputY reshaped - false); // is inputY reshaped only 1st run - - statusGEMM = arm_compute::NEGEMM::validate(descriptor.m_TransposeX ? &aclPermutedXInfo : &aclInputXInfo, - descriptor.m_TransposeY ? &aclPermutedYInfo : &aclInputYInfo, - nullptr, - &aclOutputInfo, - 1.0, - 0, - gemm_info); - - if (statusPermuteX.error_code() == arm_compute::ErrorCode::OK && - statusPermuteY.error_code() == arm_compute::ErrorCode::OK && - statusGEMM.error_code() == arm_compute::ErrorCode::OK) - { - return arm_compute::Status(arm_compute::ErrorCode::OK, - "All BatchMatMul layers validate status OK."); - } - else - { - return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, - "BatchMatMul layer validate status failed." - + statusGEMM.error_description() - + statusPermuteX.error_description() - + statusPermuteY.error_description()); - } + arm_compute::CpuMatMulSettings settings; + settings.fast_math(isFastMathEnabled); + return arm_compute::NEMatMul::validate(&aclInputInfoX, &aclInputInfoY, &aclOutputInfo, matMulInfo, settings); } -NeonBatchMatMulWorkload::NeonBatchMatMulWorkload( - const BatchMatMulQueueDescriptor& descriptor, const WorkloadInfo& info) +NeonBatchMatMulWorkload::NeonBatchMatMulWorkload(const BatchMatMulQueueDescriptor& descriptor, + const WorkloadInfo& info, + const bool isFastMathEnabled) : NeonBaseWorkload(descriptor, info) { if (descriptor.m_Parameters.m_AdjointX || descriptor.m_Parameters.m_AdjointY ) { throw Exception("Support for adjoint not implemented."); } - if (descriptor.m_Parameters.m_DataLayoutX != armnn::DataLayout::NCHW || - descriptor.m_Parameters.m_DataLayoutY != armnn::DataLayout::NCHW ) + if (descriptor.m_Parameters.m_DataLayoutX != armnn::DataLayout::NCHW + || descriptor.m_Parameters.m_DataLayoutY != armnn::DataLayout::NCHW ) { throw Exception("Only supported the MatMul in the last 2 dimensions"); } - // Report Profiling Details - ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonBatchMatMulWorkload_Construct", - descriptor.m_Parameters, - info, - this->GetGuid()); - m_Data.ValidateInputsOutputs("NeonBatchMatMulWorkload", 2, 1); arm_compute::ITensor& inputX = PolymorphicDowncast(m_Data.m_Inputs[0])->GetTensor(); arm_compute::ITensor& inputY = PolymorphicDowncast(m_Data.m_Inputs[1])->GetTensor(); - auto outputHandle = PolymorphicDowncast(m_Data.m_Outputs[0]); - arm_compute::ITensor& output = outputHandle->GetTensor(); + arm_compute::ITensor& output = PolymorphicDowncast(m_Data.m_Outputs[0])->GetTensor(); - arm_compute::DataLayout aclDataLayoutX = ConvertDataLayout(m_Data.m_Parameters.m_DataLayoutX); - arm_compute::DataLayout aclDataLayoutY = ConvertDataLayout(m_Data.m_Parameters.m_DataLayoutY); + // GeMM dispatches kernel handles dynamic inputs differently to static so this flag needs to be set + inputX.info()->set_are_values_constant(false); + inputY.info()->set_are_values_constant(false); - inputX.info()->set_data_layout(aclDataLayoutX); - inputY.info()->set_data_layout(aclDataLayoutY); + const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor); - if (descriptor.m_Parameters.m_TransposeX == true) - { - armnn::PermutationVector permutationXVector - = GeneratePermutationVectorOnLastTwoDimensions(info.m_InputTensorInfos[0].GetNumDimensions()); - const TensorInfo permutedXInfo = armnnUtils::Permuted(info.m_InputTensorInfos[0], permutationXVector); - const auto aclPermutationXVector = armcomputetensorutils::BuildArmComputePermutationVector(permutationXVector); - - auto permuteLayerX = std::make_unique(); - BuildArmComputeTensor(m_PermutedTensorX, permutedXInfo); - InitialiseArmComputeTensorEmpty(m_PermutedTensorX); - permuteLayerX->configure(&inputX, &m_PermutedTensorX, aclPermutationXVector); - m_PermuteLayerX.reset(permuteLayerX.release()); - } + arm_compute::MatMulInfo matMulInfo; + matMulInfo.adj_lhs(descriptor.m_Parameters.m_TransposeX); + matMulInfo.adj_rhs(descriptor.m_Parameters.m_TransposeY); + matMulInfo.fused_activation(activationInfo); - if (descriptor.m_Parameters.m_TransposeY == true) - { - armnn::PermutationVector permutationYVector - = GeneratePermutationVectorOnLastTwoDimensions(info.m_InputTensorInfos[1].GetNumDimensions()); - const TensorInfo permutedYInfo = armnnUtils::Permuted(info.m_InputTensorInfos[1], permutationYVector); - const auto aclPermutationYVector = armcomputetensorutils::BuildArmComputePermutationVector(permutationYVector); - - auto permuteLayerY = std::make_unique(); - BuildArmComputeTensor(m_PermutedTensorY, permutedYInfo); - InitialiseArmComputeTensorEmpty(m_PermutedTensorY); - permuteLayerY->configure(&inputY, &m_PermutedTensorY, aclPermutationYVector); - m_PermuteLayerY.reset(permuteLayerY.release()); - } + arm_compute::CpuMatMulSettings settings; + settings.fast_math(isFastMathEnabled); + + m_MatMulLayer.configure(&inputX, &inputY, &output, matMulInfo, settings); - const arm_compute::GEMMInfo& gemm_info = arm_compute::GEMMInfo(false, // is inputX reshaped - false, // is inputY reshaped - false); // is inputY reshaped only 1st run - auto gemmLayer = std::make_unique(); - gemmLayer->configure(descriptor.m_Parameters.m_TransposeX ? &m_PermutedTensorX : &inputX, - descriptor.m_Parameters.m_TransposeY ? &m_PermutedTensorY : &inputY, - nullptr, - &output, - 1.0, - 0, - gemm_info); - m_GEMMLayer.reset(gemmLayer.release()); + // Report Profiling Details + WorkloadInfo detailsInfo; + detailsInfo.m_InputTensorInfos = info.m_InputTensorInfos; + detailsInfo.m_OutputTensorInfos = info.m_OutputTensorInfos; + ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonBatchMatMulWorkload_Construct", + descriptor.m_Parameters, + detailsInfo, + GetGuid()); } void NeonBatchMatMulWorkload::Execute() const { ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonBatchMatMulWorkload_Execute", this->GetGuid()); - if (m_PermuteLayerX) - { - m_PermuteLayerX->run(); - } - if (m_PermuteLayerY) - { - m_PermuteLayerY->run(); - } - m_GEMMLayer->run(); + m_MatMulLayer.run(); } } //namespace armnn diff --git a/src/backends/neon/workloads/NeonBatchMatMulWorkload.hpp b/src/backends/neon/workloads/NeonBatchMatMulWorkload.hpp index cb004d2478..27144f2400 100644 --- a/src/backends/neon/workloads/NeonBatchMatMulWorkload.hpp +++ b/src/backends/neon/workloads/NeonBatchMatMulWorkload.hpp @@ -1,5 +1,5 @@ // -// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // @@ -7,35 +7,27 @@ #include "NeonBaseWorkload.hpp" -#include -#include - -#include +#include namespace armnn { - arm_compute::Status NeonBatchMatMulValidate(const TensorInfo& inputX, - const TensorInfo& inputY, - const TensorInfo& output, - const BatchMatMulDescriptor& descriptor); + arm_compute::Status NeonBatchMatMulValidate(const TensorInfo& inputInfoX, + const TensorInfo& inputInfoY, + const TensorInfo& outputInfo, + const BatchMatMulDescriptor& descriptor, + const bool isFastMathEnabled, + const ActivationDescriptor* activationDescriptor); + class NeonBatchMatMulWorkload : public NeonBaseWorkload { public: NeonBatchMatMulWorkload(const BatchMatMulQueueDescriptor& descriptor, - const WorkloadInfo& info); + const WorkloadInfo& info, + const bool isFastMathEnabled); virtual void Execute() const override; private: - // ACL layers required to fully form a Batch Mat Mul layer. - std::unique_ptr m_GEMMLayer; - std::unique_ptr m_PermuteLayerX; - std::unique_ptr m_PermuteLayerY; - - // Additional ACL arm_compute::Tensors. - // Required to perform permutations. - arm_compute::Tensor m_PermutedTensorX; - arm_compute::Tensor m_PermutedTensorY; - + mutable arm_compute::NEMatMul m_MatMulLayer; }; } //namespace armnn -- cgit v1.2.1