From 1fe6c8170ae2fe90b53fb71b7570aec9dfe75c45 Mon Sep 17 00:00:00 2001 From: Teresa Charlin Date: Tue, 1 Nov 2022 15:59:50 +0000 Subject: IVGCVSW-7307 Add CpuAcc Batch MatMul Workload * Call dedicated MatMul kernel in ACL * Add int8 tests * Add int8 to documentation * Force tensors to be dynamic (nonConst) as per request of ACL Signed-off-by: Teresa Charlin Change-Id: I992ae9aae1174214607bf29305f21cdeaf3fdc1b --- delegate/test/BatchMatMulTest.cpp | 14 +- docs/02_operator_list.dox | 3 +- .../test/BatchMatMulEndToEndTestImpl.hpp | 42 +++-- .../test/layerTests/BatchMatMulTestImpl.cpp | 20 ++- src/backends/neon/NeonLayerSupport.cpp | 18 +- src/backends/neon/NeonWorkloadFactory.cpp | 14 +- src/backends/neon/test/NeonEndToEndTests.cpp | 5 + src/backends/neon/test/NeonLayerTests.cpp | 16 +- .../neon/workloads/NeonBatchMatMulWorkload.cpp | 183 ++++++--------------- .../neon/workloads/NeonBatchMatMulWorkload.hpp | 32 ++-- src/backends/reference/test/RefEndToEndTests.cpp | 5 + 11 files changed, 173 insertions(+), 179 deletions(-) diff --git a/delegate/test/BatchMatMulTest.cpp b/delegate/test/BatchMatMulTest.cpp index c6d7bc5f32..5cd1a70141 100644 --- a/delegate/test/BatchMatMulTest.cpp +++ b/delegate/test/BatchMatMulTest.cpp @@ -663,12 +663,22 @@ namespace armnnDelegate BatchMatMul3DFp32SimpleTest (backends); BatchMatMul4DFp32SimpleTest (backends); BatchMatMul3DFp32BatchTest (backends); - BatchMatMul3DFp32BroadcastTest (backends); - BatchMatMul3D2DFp32BroadcastTest (backends); BatchMatMul2DFp32TinyTest (backends); BatchMatMulNonSquareFp32Test (backends); BatchMatMul2DFp32SimpleAdjointTest(backends); } + + TEST_CASE("BATCH_MATMUL_Int8_CpuAccTests") + { + std::vector backends = {armnn::Compute::CpuAcc}; + BatchMatMul2DInt8SimpleTest (backends); + BatchMatMul3DInt8SimpleTest (backends); + BatchMatMul4DInt8SimpleTest (backends); + BatchMatMul3DInt8BatchTest (backends); + BatchMatMul2DInt8TinyTest (backends); + BatchMatMulNonSquareInt8Test (backends); + BatchMatMul2DInt8SimpleAdjointTest(backends); + } } TEST_SUITE("BATCH_MATMUL_GpuAccTests") { diff --git a/docs/02_operator_list.dox b/docs/02_operator_list.dox index 007d4f5e35..791565a985 100644 --- a/docs/02_operator_list.dox +++ b/docs/02_operator_list.dox @@ -1,4 +1,4 @@ -/// Copyright (c) 2021 ARM Limited and Contributors. All rights reserved. +/// Copyright (c) 2021, 2023 ARM Limited and Contributors. All rights reserved. /// /// SPDX-License-Identifier: MIT /// @@ -299,6 +299,7 @@ where N = batches, C = channels, H = height, W = width
       FLOAT32
+      QASYMMS8
GpuAcc diff --git a/src/backends/backendsCommon/test/BatchMatMulEndToEndTestImpl.hpp b/src/backends/backendsCommon/test/BatchMatMulEndToEndTestImpl.hpp index 905a56d53a..98e75cb8df 100644 --- a/src/backends/backendsCommon/test/BatchMatMulEndToEndTestImpl.hpp +++ b/src/backends/backendsCommon/test/BatchMatMulEndToEndTestImpl.hpp @@ -1,5 +1,5 @@ // -// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // #pragma once @@ -55,30 +55,38 @@ void BatchMatMulEndToEnd(const std::vector& backends) const TensorShape& inputYShape = { 2, 2, 2 }; const TensorShape& outputShape = { 2, 2, 2 }; - INetworkPtr network = CreateBatchMatMulNetwork(inputXShape, inputYShape, outputShape); + constexpr float qScale = 1.0f; + constexpr int32_t qOffset = 0; + + INetworkPtr network = CreateBatchMatMulNetwork(inputXShape, inputYShape, outputShape, qScale, qOffset); CHECK(network); - std::vector inputXData{ 1, 2, - 3, 4, + std::vector floatInputXData{ 1., 2., + 3., 4., + + 9., 10., + 11., 12. }; + std::vector inputXData = armnnUtils::QuantizedVector(floatInputXData, qScale, qOffset); + + std::vector floatInputYData{ 5., 7., + 6., 8., - 9, 10, - 11, 12 }; - std::vector inputYData{ 5, 7, - 6, 8, + 13., 15., + 14., 16. }; + std::vector inputYData = armnnUtils::QuantizedVector(floatInputYData, qScale, qOffset); - 13, 15, - 14, 16 }; - std::vector expectedOutput{ 19, 22, - 43, 50, + std::vector floatExpectedOutputData{ 19., 22., + 43., 50., - 267, 286, - 323, 346 }; + 267., 286., + 323., 346. }; + std::vector expectedOutputData = armnnUtils::QuantizedVector(floatExpectedOutputData, qScale, qOffset); - std::map> inputTensorData = {{ 0, inputXData }, {1, inputYData}}; - std::map> expectedOutputData = { { 0, expectedOutput } }; + std::map> inputTensor = {{ 0, inputXData }, {1, inputYData}}; + std::map> expectedOutput = { { 0, expectedOutputData } }; - EndToEndLayerTestImpl(std::move(network), inputTensorData, expectedOutputData, backends); + EndToEndLayerTestImpl(std::move(network), inputTensor, expectedOutput, backends); } } // anonymous namespace \ No newline at end of file diff --git a/src/backends/backendsCommon/test/layerTests/BatchMatMulTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/BatchMatMulTestImpl.cpp index 74bd97f103..504ca1d304 100644 --- a/src/backends/backendsCommon/test/layerTests/BatchMatMulTestImpl.cpp +++ b/src/backends/backendsCommon/test/layerTests/BatchMatMulTestImpl.cpp @@ -1,5 +1,5 @@ // -// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. 
// SPDX-License-Identifier: MIT // @@ -14,6 +14,7 @@ #include #include #include +#include template @@ -29,6 +30,7 @@ LayerTestResult BatchMatMulTestImpl( const armnn::TensorInfo& inputYInfo, const armnn::TensorInfo& outputInfo) { + LayerTestResult result(outputInfo); std::vector outputActual(outputInfo.GetNumElements()); std::unique_ptr inputXHandle = tensorHandleFactory.CreateTensorHandle(inputXInfo); @@ -36,13 +38,27 @@ LayerTestResult BatchMatMulTestImpl( std::unique_ptr outputHandle = tensorHandleFactory.CreateTensorHandle(outputInfo); armnn::BatchMatMulQueueDescriptor queueDescriptor; - queueDescriptor.m_Parameters = descriptor; + queueDescriptor.m_Parameters = std::move(descriptor); armnn::WorkloadInfo workloadInfo; AddInputToWorkload(queueDescriptor, workloadInfo, inputXInfo, inputXHandle.get()); AddInputToWorkload(queueDescriptor, workloadInfo, inputYInfo, inputYHandle.get()); AddOutputToWorkload(queueDescriptor, workloadInfo, outputInfo, outputHandle.get()); + // Don't execute if BatchMatMul is not supported, as an exception will be raised. + const armnn::BackendId& backend = workloadFactory.GetBackendId(); + std::string reasonIfUnsupported; + armnn::LayerSupportHandle handle = armnn::GetILayerSupportByBackendId(backend); + result.m_Supported = handle.IsBatchMatMulSupported(inputXInfo, + inputYInfo, + outputInfo, + queueDescriptor.m_Parameters, + reasonIfUnsupported); + if (!result.m_Supported) + { + return result; + } + auto workload = workloadFactory.CreateWorkload(armnn::LayerType::BatchMatMul, queueDescriptor, workloadInfo); inputXHandle->Allocate(); diff --git a/src/backends/neon/NeonLayerSupport.cpp b/src/backends/neon/NeonLayerSupport.cpp index 4e4d7fa73d..cd4dca8edb 100644 --- a/src/backends/neon/NeonLayerSupport.cpp +++ b/src/backends/neon/NeonLayerSupport.cpp @@ -760,12 +760,28 @@ bool NeonLayerSupport::IsBatchMatMulSupported(const TensorInfo& inputX, const BatchMatMulDescriptor& descriptor, Optional reasonIfUnsupported) const { + bool isFastMathEnabled = false; +#if defined(ARMCOMPUTENEON_ENABLED) + if (m_ModelContextPtr) + { + if (m_ModelContextPtr.get() != nullptr) + { + auto modelOptions = dynamic_cast(m_ModelContextPtr.get()); + if (modelOptions) + { + isFastMathEnabled = modelOptions->IsFastMathEnabled(); + } + } + } +#endif FORWARD_WORKLOAD_VALIDATE_FUNC(NeonBatchMatMulValidate, reasonIfUnsupported, inputX, inputY, output, - descriptor); + descriptor, + isFastMathEnabled, + nullptr); } bool NeonLayerSupport::IsBatchNormalizationSupported(const TensorInfo& input, diff --git a/src/backends/neon/NeonWorkloadFactory.cpp b/src/backends/neon/NeonWorkloadFactory.cpp index 08168eca2f..c78b58d21d 100644 --- a/src/backends/neon/NeonWorkloadFactory.cpp +++ b/src/backends/neon/NeonWorkloadFactory.cpp @@ -155,7 +155,19 @@ std::unique_ptr NeonWorkloadFactory::CreateWorkload(LayerType type, case LayerType::BatchMatMul : { auto batchMatMulQueueDescriptor = PolymorphicDowncast(&descriptor); - return std::make_unique(*batchMatMulQueueDescriptor, info); + bool isFastMathEnabled = false; + if (m_ModelContextPtr) + { + if (m_ModelContextPtr.get() != nullptr) + { + auto modelOptions = dynamic_cast(m_ModelContextPtr.get()); + if (modelOptions) + { + isFastMathEnabled = modelOptions->IsFastMathEnabled(); + } + } + } + return std::make_unique(*batchMatMulQueueDescriptor, info, isFastMathEnabled); } case LayerType::BatchNormalization : { diff --git a/src/backends/neon/test/NeonEndToEndTests.cpp b/src/backends/neon/test/NeonEndToEndTests.cpp index b930004215..fb05cc415f 100644 --- 
a/src/backends/neon/test/NeonEndToEndTests.cpp +++ b/src/backends/neon/test/NeonEndToEndTests.cpp @@ -148,6 +148,11 @@ TEST_CASE("NeonBatchMatMulEndToEndFloat32Test") BatchMatMulEndToEnd(neonDefaultBackends); } +TEST_CASE("NeonBatchMatMulEndToEndInt8Test") +{ + BatchMatMulEndToEnd(neonDefaultBackends); +} + TEST_CASE("NeonConcatEndToEndDim0Test") { ConcatDim0EndToEnd(neonDefaultBackends); diff --git a/src/backends/neon/test/NeonLayerTests.cpp b/src/backends/neon/test/NeonLayerTests.cpp index 4e6b6fa1d6..715060717f 100644 --- a/src/backends/neon/test/NeonLayerTests.cpp +++ b/src/backends/neon/test/NeonLayerTests.cpp @@ -76,20 +76,32 @@ ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNchwUint7, BatchToSpaceNdNchwTest7); +ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul2DSimpleInt8, BatchMatMul2DSimpleTest); ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul3DSimpleFloat32, BatchMatMul3DSimpleTest); +ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul3DSimpleInt8, BatchMatMul3DSimpleTest); ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMulNCHWSimpleFloat32, BatchMatMulNCHWSimpleTest); +ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMulNCHWSimpleInt8, BatchMatMulNCHWSimpleTest); ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul3DBatchFloat32, BatchMatMul3DBatchTest); +ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul3DBatchInt8, BatchMatMul3DBatchTest); -ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul3DBroadcastFloat32, BatchMatMul3DBroadcastTest); +ARMNN_AUTO_TEST_CASE_WITH_THF(UNSUPPORTED_BatchMatMul3DBroadcastFloat32, + BatchMatMul3DBroadcastTest); +ARMNN_AUTO_TEST_CASE_WITH_THF(UNSUPPORTED_BatchMatMul3DBroadcastInt8, + BatchMatMul3DBroadcastTest); -ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul3D2DBroadcastFloat32, BatchMatMul3D2DBroadcastTest); +ARMNN_AUTO_TEST_CASE_WITH_THF(UNSUPPORTED_BatchMatMul3D2DBroadcastFloat32, + BatchMatMul3D2DBroadcastTest); +ARMNN_AUTO_TEST_CASE_WITH_THF(UNSUPPORTED_BatchMatMul3D2DBroadcastInt8, + BatchMatMul3D2DBroadcastTest); ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul2DTinyFloat32, BatchMatMul2DTinyTest); +ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul2DTinyInt8, BatchMatMul2DTinyTest); ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul2DTranspSimpleFloat32, BatchMatMul2DTranspSimpleTest); +ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul2DTranspSimpleInt8, BatchMatMul2DTranspSimpleTest); // Convolution ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleConvolution1d, Convolution1dTest, true) diff --git a/src/backends/neon/workloads/NeonBatchMatMulWorkload.cpp b/src/backends/neon/workloads/NeonBatchMatMulWorkload.cpp index 3d8651f995..9b22033bd1 100644 --- a/src/backends/neon/workloads/NeonBatchMatMulWorkload.cpp +++ b/src/backends/neon/workloads/NeonBatchMatMulWorkload.cpp @@ -1,5 +1,5 @@ // -// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. 
// SPDX-License-Identifier: MIT // @@ -8,22 +8,18 @@ #include "NeonWorkloadUtils.hpp" #include - -#include +#include #include -#include - -#include - - namespace armnn { -arm_compute::Status NeonBatchMatMulValidate(const TensorInfo& inputX, - const TensorInfo& inputY, - const TensorInfo& output, - const BatchMatMulDescriptor& descriptor) +arm_compute::Status NeonBatchMatMulValidate(const TensorInfo& inputInfoX, + const TensorInfo& inputInfoY, + const TensorInfo& outputInfo, + const BatchMatMulDescriptor& descriptor, + const bool isFastMathEnabled, + const ActivationDescriptor* activationDescriptor) { if (descriptor.m_AdjointX || descriptor.m_AdjointY ) { @@ -34,157 +30,78 @@ arm_compute::Status NeonBatchMatMulValidate(const TensorInfo& inputX, throw Exception("Only supported the MatMul in the last 2 dimensions"); } - const auto aclInputXInfo = armcomputetensorutils::BuildArmComputeTensorInfo(inputX, descriptor.m_DataLayoutX); - const auto aclInputYInfo = armcomputetensorutils::BuildArmComputeTensorInfo(inputY, descriptor.m_DataLayoutY); - const auto aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output); + arm_compute::TensorInfo aclInputInfoX = armcomputetensorutils::BuildArmComputeTensorInfo(inputInfoX); + arm_compute::TensorInfo aclInputInfoY = armcomputetensorutils::BuildArmComputeTensorInfo(inputInfoY); + arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(outputInfo); - arm_compute::Status statusGEMM = arm_compute::Status(arm_compute::ErrorCode::OK); - arm_compute::Status statusPermuteX = arm_compute::Status(arm_compute::ErrorCode::OK); - arm_compute::Status statusPermuteY = arm_compute::Status(arm_compute::ErrorCode::OK); + // GeMM dispatches kernel handles dynamic inputs differently to static so this flag needs to be set + aclInputInfoX.set_are_values_constant(false); + aclInputInfoY.set_are_values_constant(false); - arm_compute::TensorInfo aclPermutedXInfo = arm_compute::TensorInfo(); - arm_compute::TensorInfo aclPermutedYInfo = arm_compute::TensorInfo(); + const arm_compute::ActivationLayerInfo activationInfo = ConvertActivationDescriptorToAclActivationLayerInfo( + activationDescriptor); - if (descriptor.m_TransposeX == true) - { - auto permutationXVector = GeneratePermutationVectorOnLastTwoDimensions(inputX.GetNumDimensions()); - const auto aclPermutationXVector = armcomputetensorutils::BuildArmComputePermutationVector(permutationXVector); - const TensorInfo permutedXInfo = armnnUtils::Permuted(inputX, permutationXVector); - aclPermutedXInfo = armcomputetensorutils::BuildArmComputeTensorInfo(permutedXInfo); - - statusPermuteX = arm_compute::NEPermute::validate(&aclInputXInfo, - &aclPermutedXInfo, - aclPermutationXVector); - } + arm_compute::MatMulInfo matMulInfo; + matMulInfo.adj_lhs(descriptor.m_TransposeX); + matMulInfo.adj_rhs(descriptor.m_TransposeY); + matMulInfo.fused_activation(activationInfo); - if (descriptor.m_TransposeY == true) - { - auto permutationYVector = GeneratePermutationVectorOnLastTwoDimensions(inputY.GetNumDimensions()); - const auto aclPermutationYVector = armcomputetensorutils::BuildArmComputePermutationVector(permutationYVector); - const TensorInfo permutedYInfo = armnnUtils::Permuted(inputY, permutationYVector); - aclPermutedYInfo = armcomputetensorutils::BuildArmComputeTensorInfo(permutedYInfo); - - statusPermuteY = arm_compute::NEPermute::validate(&aclInputYInfo, - &aclPermutedYInfo, - aclPermutationYVector); - } - - const arm_compute::GEMMInfo& gemm_info = arm_compute::GEMMInfo(false, // is 
inputX reshaped - false, // is inputY reshaped - false); // is inputY reshaped only 1st run - - statusGEMM = arm_compute::NEGEMM::validate(descriptor.m_TransposeX ? &aclPermutedXInfo : &aclInputXInfo, - descriptor.m_TransposeY ? &aclPermutedYInfo : &aclInputYInfo, - nullptr, - &aclOutputInfo, - 1.0, - 0, - gemm_info); - - if (statusPermuteX.error_code() == arm_compute::ErrorCode::OK && - statusPermuteY.error_code() == arm_compute::ErrorCode::OK && - statusGEMM.error_code() == arm_compute::ErrorCode::OK) - { - return arm_compute::Status(arm_compute::ErrorCode::OK, - "All BatchMatMul layers validate status OK."); - } - else - { - return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, - "BatchMatMul layer validate status failed." - + statusGEMM.error_description() - + statusPermuteX.error_description() - + statusPermuteY.error_description()); - } + arm_compute::CpuMatMulSettings settings; + settings.fast_math(isFastMathEnabled); + return arm_compute::NEMatMul::validate(&aclInputInfoX, &aclInputInfoY, &aclOutputInfo, matMulInfo, settings); } -NeonBatchMatMulWorkload::NeonBatchMatMulWorkload( - const BatchMatMulQueueDescriptor& descriptor, const WorkloadInfo& info) +NeonBatchMatMulWorkload::NeonBatchMatMulWorkload(const BatchMatMulQueueDescriptor& descriptor, + const WorkloadInfo& info, + const bool isFastMathEnabled) : NeonBaseWorkload(descriptor, info) { if (descriptor.m_Parameters.m_AdjointX || descriptor.m_Parameters.m_AdjointY ) { throw Exception("Support for adjoint not implemented."); } - if (descriptor.m_Parameters.m_DataLayoutX != armnn::DataLayout::NCHW || - descriptor.m_Parameters.m_DataLayoutY != armnn::DataLayout::NCHW ) + if (descriptor.m_Parameters.m_DataLayoutX != armnn::DataLayout::NCHW + || descriptor.m_Parameters.m_DataLayoutY != armnn::DataLayout::NCHW ) { throw Exception("Only supported the MatMul in the last 2 dimensions"); } - // Report Profiling Details - ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonBatchMatMulWorkload_Construct", - descriptor.m_Parameters, - info, - this->GetGuid()); - m_Data.ValidateInputsOutputs("NeonBatchMatMulWorkload", 2, 1); arm_compute::ITensor& inputX = PolymorphicDowncast(m_Data.m_Inputs[0])->GetTensor(); arm_compute::ITensor& inputY = PolymorphicDowncast(m_Data.m_Inputs[1])->GetTensor(); - auto outputHandle = PolymorphicDowncast(m_Data.m_Outputs[0]); - arm_compute::ITensor& output = outputHandle->GetTensor(); + arm_compute::ITensor& output = PolymorphicDowncast(m_Data.m_Outputs[0])->GetTensor(); - arm_compute::DataLayout aclDataLayoutX = ConvertDataLayout(m_Data.m_Parameters.m_DataLayoutX); - arm_compute::DataLayout aclDataLayoutY = ConvertDataLayout(m_Data.m_Parameters.m_DataLayoutY); + // GeMM dispatches kernel handles dynamic inputs differently to static so this flag needs to be set + inputX.info()->set_are_values_constant(false); + inputY.info()->set_are_values_constant(false); - inputX.info()->set_data_layout(aclDataLayoutX); - inputY.info()->set_data_layout(aclDataLayoutY); + const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor); - if (descriptor.m_Parameters.m_TransposeX == true) - { - armnn::PermutationVector permutationXVector - = GeneratePermutationVectorOnLastTwoDimensions(info.m_InputTensorInfos[0].GetNumDimensions()); - const TensorInfo permutedXInfo = armnnUtils::Permuted(info.m_InputTensorInfos[0], permutationXVector); - const auto aclPermutationXVector = armcomputetensorutils::BuildArmComputePermutationVector(permutationXVector); - - auto permuteLayerX = 
std::make_unique(); - BuildArmComputeTensor(m_PermutedTensorX, permutedXInfo); - InitialiseArmComputeTensorEmpty(m_PermutedTensorX); - permuteLayerX->configure(&inputX, &m_PermutedTensorX, aclPermutationXVector); - m_PermuteLayerX.reset(permuteLayerX.release()); - } + arm_compute::MatMulInfo matMulInfo; + matMulInfo.adj_lhs(descriptor.m_Parameters.m_TransposeX); + matMulInfo.adj_rhs(descriptor.m_Parameters.m_TransposeY); + matMulInfo.fused_activation(activationInfo); - if (descriptor.m_Parameters.m_TransposeY == true) - { - armnn::PermutationVector permutationYVector - = GeneratePermutationVectorOnLastTwoDimensions(info.m_InputTensorInfos[1].GetNumDimensions()); - const TensorInfo permutedYInfo = armnnUtils::Permuted(info.m_InputTensorInfos[1], permutationYVector); - const auto aclPermutationYVector = armcomputetensorutils::BuildArmComputePermutationVector(permutationYVector); - - auto permuteLayerY = std::make_unique(); - BuildArmComputeTensor(m_PermutedTensorY, permutedYInfo); - InitialiseArmComputeTensorEmpty(m_PermutedTensorY); - permuteLayerY->configure(&inputY, &m_PermutedTensorY, aclPermutationYVector); - m_PermuteLayerY.reset(permuteLayerY.release()); - } + arm_compute::CpuMatMulSettings settings; + settings.fast_math(isFastMathEnabled); + + m_MatMulLayer.configure(&inputX, &inputY, &output, matMulInfo, settings); - const arm_compute::GEMMInfo& gemm_info = arm_compute::GEMMInfo(false, // is inputX reshaped - false, // is inputY reshaped - false); // is inputY reshaped only 1st run - auto gemmLayer = std::make_unique(); - gemmLayer->configure(descriptor.m_Parameters.m_TransposeX ? &m_PermutedTensorX : &inputX, - descriptor.m_Parameters.m_TransposeY ? &m_PermutedTensorY : &inputY, - nullptr, - &output, - 1.0, - 0, - gemm_info); - m_GEMMLayer.reset(gemmLayer.release()); + // Report Profiling Details + WorkloadInfo detailsInfo; + detailsInfo.m_InputTensorInfos = info.m_InputTensorInfos; + detailsInfo.m_OutputTensorInfos = info.m_OutputTensorInfos; + ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonBatchMatMulWorkload_Construct", + descriptor.m_Parameters, + detailsInfo, + GetGuid()); } void NeonBatchMatMulWorkload::Execute() const { ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonBatchMatMulWorkload_Execute", this->GetGuid()); - if (m_PermuteLayerX) - { - m_PermuteLayerX->run(); - } - if (m_PermuteLayerY) - { - m_PermuteLayerY->run(); - } - m_GEMMLayer->run(); + m_MatMulLayer.run(); } } //namespace armnn diff --git a/src/backends/neon/workloads/NeonBatchMatMulWorkload.hpp b/src/backends/neon/workloads/NeonBatchMatMulWorkload.hpp index cb004d2478..27144f2400 100644 --- a/src/backends/neon/workloads/NeonBatchMatMulWorkload.hpp +++ b/src/backends/neon/workloads/NeonBatchMatMulWorkload.hpp @@ -1,5 +1,5 @@ // -// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. 
// SPDX-License-Identifier: MIT // @@ -7,35 +7,27 @@ #include "NeonBaseWorkload.hpp" -#include -#include - -#include +#include namespace armnn { - arm_compute::Status NeonBatchMatMulValidate(const TensorInfo& inputX, - const TensorInfo& inputY, - const TensorInfo& output, - const BatchMatMulDescriptor& descriptor); + arm_compute::Status NeonBatchMatMulValidate(const TensorInfo& inputInfoX, + const TensorInfo& inputInfoY, + const TensorInfo& outputInfo, + const BatchMatMulDescriptor& descriptor, + const bool isFastMathEnabled, + const ActivationDescriptor* activationDescriptor); + class NeonBatchMatMulWorkload : public NeonBaseWorkload { public: NeonBatchMatMulWorkload(const BatchMatMulQueueDescriptor& descriptor, - const WorkloadInfo& info); + const WorkloadInfo& info, + const bool isFastMathEnabled); virtual void Execute() const override; private: - // ACL layers required to fully form a Batch Mat Mul layer. - std::unique_ptr m_GEMMLayer; - std::unique_ptr m_PermuteLayerX; - std::unique_ptr m_PermuteLayerY; - - // Additional ACL arm_compute::Tensors. - // Required to perform permutations. - arm_compute::Tensor m_PermutedTensorX; - arm_compute::Tensor m_PermutedTensorY; - + mutable arm_compute::NEMatMul m_MatMulLayer; }; } //namespace armnn diff --git a/src/backends/reference/test/RefEndToEndTests.cpp b/src/backends/reference/test/RefEndToEndTests.cpp index 8bf414fdb0..95004c4dc2 100644 --- a/src/backends/reference/test/RefEndToEndTests.cpp +++ b/src/backends/reference/test/RefEndToEndTests.cpp @@ -486,6 +486,11 @@ TEST_CASE("RefBatchMatMulEndToEndFloat32Test") BatchMatMulEndToEnd(defaultBackends); } +TEST_CASE("RefBatchMatMulEndToEndInt8Test") +{ + BatchMatMulEndToEnd(defaultBackends); +} + TEST_CASE("RefBatchToSpaceNdEndToEndFloat32NHWCTest") { BatchToSpaceNdEndToEnd(defaultBackends, armnn::DataLayout::NHWC); -- cgit v1.2.1
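
For readers who want the ACL call pattern this patch switches to in isolation, below is a minimal standalone sketch (not part of the patch). It mirrors the validate/configure/run sequence and the set_are_values_constant(false) requirement from the new NeonBatchMatMulWorkload.cpp; the 2x2 Float32 shapes, the main() wrapper, and the include paths are illustrative assumptions and may need adjusting to the Compute Library version in use.

#include <arm_compute/core/TensorInfo.h>
#include <arm_compute/core/TensorShape.h>
#include <arm_compute/core/Types.h>
#include <arm_compute/runtime/NEON/functions/NEMatMul.h>
#include <arm_compute/runtime/Tensor.h>

int main()
{
    using namespace arm_compute;

    // Illustrative 2x2 * 2x2 Float32 matmul; shapes and data type are assumptions for this sketch.
    Tensor lhs, rhs, dst;
    lhs.allocator()->init(TensorInfo(TensorShape(2U, 2U), 1, DataType::F32));
    rhs.allocator()->init(TensorInfo(TensorShape(2U, 2U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(2U, 2U), 1, DataType::F32));

    // As in the workload: mark both inputs non-constant so the MatMul kernel treats them as dynamic.
    lhs.info()->set_are_values_constant(false);
    rhs.info()->set_are_values_constant(false);

    // adj_lhs/adj_rhs correspond to BatchMatMulDescriptor::m_TransposeX/m_TransposeY.
    MatMulInfo matMulInfo;
    matMulInfo.adj_lhs(false);
    matMulInfo.adj_rhs(false);

    // fast_math mirrors the FastMathEnabled backend model option queried in NeonLayerSupport.
    CpuMatMulSettings settings;
    settings.fast_math(false);

    // Same validate-then-configure sequence as NeonBatchMatMulValidate and the workload constructor.
    Status status = NEMatMul::validate(lhs.info(), rhs.info(), dst.info(), matMulInfo, settings);
    if (status.error_code() != ErrorCode::OK)
    {
        return 1;
    }

    NEMatMul matMul;
    matMul.configure(&lhs, &rhs, &dst, matMulInfo, settings);

    lhs.allocator()->allocate();
    rhs.allocator()->allocate();
    dst.allocator()->allocate();

    matMul.run();
    return 0;
}

Pushing the transpose handling into MatMulInfo (adj_lhs/adj_rhs) is what lets the workload drop the hand-built NEPermute + NEGEMM pipeline that the previous implementation assembled and ran per execution.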