author    Teresa Charlin <teresa.charlinreyes@arm.com>  2022-11-01 15:59:50 +0000
committer TeresaARM <teresa.charlinreyes@arm.com>       2023-05-08 13:16:01 +0000
commit    1fe6c8170ae2fe90b53fb71b7570aec9dfe75c45 (patch)
tree      bbb846edda64445c1e033b182e5a079c8d5728d8
parent    c52190a7e80cf238ba1d8630e5cc36ec7c7849e2 (diff)
download  armnn-1fe6c8170ae2fe90b53fb71b7570aec9dfe75c45.tar.gz
IVGCVSW-7307 Add CpuAcc Batch MatMul Workload
* Call dedicated MatMul kernel in ACL
* Add int8 tests
* Add int8 to documentation
* Force tensors to be dynamic (nonConst) as per request of ACL

Signed-off-by: Teresa Charlin <teresa.charlinreyes@arm.com>
Change-Id: I992ae9aae1174214607bf29305f21cdeaf3fdc1b
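For context, the heart of the change is that the Neon workload now drives ACL's dedicated NEMatMul function directly instead of composing NEPermute + NEGEMM. Below is a minimal sketch of the call pattern this patch introduces; the function name and tensor parameters are illustrative assumptions, not part of the commit, and only the ACL calls mirrored from the patch itself should be taken as authoritative.

// Illustrative sketch only: mirrors the ACL calls used by NeonBatchMatMulWorkload.
#include <arm_compute/runtime/NEON/functions/NEMatMul.h>

void RunMatMul(arm_compute::ITensor& inputX,   // hypothetical, pre-allocated tensors
               arm_compute::ITensor& inputY,
               arm_compute::ITensor& output,
               bool transposeX,
               bool transposeY,
               bool isFastMathEnabled)
{
    // ACL's MatMul kernel treats dynamic (non-constant) inputs differently to
    // static ones, hence the "force tensors to be dynamic (nonConst)" note above.
    inputX.info()->set_are_values_constant(false);
    inputY.info()->set_are_values_constant(false);

    arm_compute::MatMulInfo matMulInfo;
    matMulInfo.adj_lhs(transposeX);   // maps from the descriptor's m_TransposeX
    matMulInfo.adj_rhs(transposeY);   // maps from the descriptor's m_TransposeY

    arm_compute::CpuMatMulSettings settings;
    settings.fast_math(isFastMathEnabled);

    arm_compute::NEMatMul matMul;
    matMul.configure(&inputX, &inputY, &output, matMulInfo, settings);
    matMul.run();
}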
-rw-r--r--  delegate/test/BatchMatMulTest.cpp                                     |  14
-rw-r--r--  docs/02_operator_list.dox                                             |   3
-rw-r--r--  src/backends/backendsCommon/test/BatchMatMulEndToEndTestImpl.hpp      |  42
-rw-r--r--  src/backends/backendsCommon/test/layerTests/BatchMatMulTestImpl.cpp   |  20
-rw-r--r--  src/backends/neon/NeonLayerSupport.cpp                                |  18
-rw-r--r--  src/backends/neon/NeonWorkloadFactory.cpp                             |  14
-rw-r--r--  src/backends/neon/test/NeonEndToEndTests.cpp                          |   5
-rw-r--r--  src/backends/neon/test/NeonLayerTests.cpp                             |  16
-rw-r--r--  src/backends/neon/workloads/NeonBatchMatMulWorkload.cpp               | 183
-rw-r--r--  src/backends/neon/workloads/NeonBatchMatMulWorkload.hpp               |  32
-rw-r--r--  src/backends/reference/test/RefEndToEndTests.cpp                      |   5
11 files changed, 173 insertions(+), 179 deletions(-)
diff --git a/delegate/test/BatchMatMulTest.cpp b/delegate/test/BatchMatMulTest.cpp
index c6d7bc5f32..5cd1a70141 100644
--- a/delegate/test/BatchMatMulTest.cpp
+++ b/delegate/test/BatchMatMulTest.cpp
@@ -663,12 +663,22 @@ namespace armnnDelegate
BatchMatMul3DFp32SimpleTest (backends);
BatchMatMul4DFp32SimpleTest (backends);
BatchMatMul3DFp32BatchTest (backends);
- BatchMatMul3DFp32BroadcastTest (backends);
- BatchMatMul3D2DFp32BroadcastTest (backends);
BatchMatMul2DFp32TinyTest (backends);
BatchMatMulNonSquareFp32Test (backends);
BatchMatMul2DFp32SimpleAdjointTest(backends);
}
+
+ TEST_CASE("BATCH_MATMUL_Int8_CpuAccTests")
+ {
+ std::vector<armnn::BackendId> backends = {armnn::Compute::CpuAcc};
+ BatchMatMul2DInt8SimpleTest (backends);
+ BatchMatMul3DInt8SimpleTest (backends);
+ BatchMatMul4DInt8SimpleTest (backends);
+ BatchMatMul3DInt8BatchTest (backends);
+ BatchMatMul2DInt8TinyTest (backends);
+ BatchMatMulNonSquareInt8Test (backends);
+ BatchMatMul2DInt8SimpleAdjointTest(backends);
+ }
}
TEST_SUITE("BATCH_MATMUL_GpuAccTests")
{
diff --git a/docs/02_operator_list.dox b/docs/02_operator_list.dox
index 007d4f5e35..791565a985 100644
--- a/docs/02_operator_list.dox
+++ b/docs/02_operator_list.dox
@@ -1,4 +1,4 @@
-/// Copyright (c) 2021 ARM Limited and Contributors. All rights reserved.
+/// Copyright (c) 2021, 2023 ARM Limited and Contributors. All rights reserved.
///
/// SPDX-License-Identifier: MIT
///
@@ -299,6 +299,7 @@ where N = batches, C = channels, H = height, W = width
<table>
<tr><th>
<tr><td>FLOAT32
+ <tr><td>QASYMMS8
</table>
<tr>
<td>GpuAcc
diff --git a/src/backends/backendsCommon/test/BatchMatMulEndToEndTestImpl.hpp b/src/backends/backendsCommon/test/BatchMatMulEndToEndTestImpl.hpp
index 905a56d53a..98e75cb8df 100644
--- a/src/backends/backendsCommon/test/BatchMatMulEndToEndTestImpl.hpp
+++ b/src/backends/backendsCommon/test/BatchMatMulEndToEndTestImpl.hpp
@@ -1,5 +1,5 @@
//
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
#pragma once
@@ -55,30 +55,38 @@ void BatchMatMulEndToEnd(const std::vector<armnn::BackendId>& backends)
const TensorShape& inputYShape = { 2, 2, 2 };
const TensorShape& outputShape = { 2, 2, 2 };
- INetworkPtr network = CreateBatchMatMulNetwork<ArmnnType>(inputXShape, inputYShape, outputShape);
+ constexpr float qScale = 1.0f;
+ constexpr int32_t qOffset = 0;
+
+ INetworkPtr network = CreateBatchMatMulNetwork<ArmnnType>(inputXShape, inputYShape, outputShape, qScale, qOffset);
CHECK(network);
- std::vector<T> inputXData{ 1, 2,
- 3, 4,
+ std::vector<float> floatInputXData{ 1., 2.,
+ 3., 4.,
+
+ 9., 10.,
+ 11., 12. };
+ std::vector<T> inputXData = armnnUtils::QuantizedVector<T>(floatInputXData, qScale, qOffset);
+
+ std::vector<float> floatInputYData{ 5., 7.,
+ 6., 8.,
- 9, 10,
- 11, 12 };
- std::vector<T> inputYData{ 5, 7,
- 6, 8,
+ 13., 15.,
+ 14., 16. };
+ std::vector<T> inputYData = armnnUtils::QuantizedVector<T>(floatInputYData, qScale, qOffset);
- 13, 15,
- 14, 16 };
- std::vector<T> expectedOutput{ 19, 22,
- 43, 50,
+ std::vector<float> floatExpectedOutputData{ 19., 22.,
+ 43., 50.,
- 267, 286,
- 323, 346 };
+ 267., 286.,
+ 323., 346. };
+ std::vector<T> expectedOutputData = armnnUtils::QuantizedVector<T>(floatExpectedOutputData, qScale, qOffset);
- std::map<int, std::vector<T>> inputTensorData = {{ 0, inputXData }, {1, inputYData}};
- std::map<int, std::vector<T>> expectedOutputData = { { 0, expectedOutput } };
+ std::map<int, std::vector<T>> inputTensor = {{ 0, inputXData }, {1, inputYData}};
+ std::map<int, std::vector<T>> expectedOutput = { { 0, expectedOutputData } };
- EndToEndLayerTestImpl<ArmnnType, ArmnnType>(std::move(network), inputTensorData, expectedOutputData, backends);
+ EndToEndLayerTestImpl<ArmnnType, ArmnnType>(std::move(network), inputTensor, expectedOutput, backends);
}
} // anonymous namespace
\ No newline at end of file
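A brief note on the quantisation helper used above: armnnUtils::QuantizedVector applies q = round(v / qScale) + qOffset (clamped to the range of T), so with qScale = 1.0f and qOffset = 0 the QAsymmS8 end-to-end test exercises the same numeric values as the Float32 variant; only the storage type changes.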
diff --git a/src/backends/backendsCommon/test/layerTests/BatchMatMulTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/BatchMatMulTestImpl.cpp
index 74bd97f103..504ca1d304 100644
--- a/src/backends/backendsCommon/test/layerTests/BatchMatMulTestImpl.cpp
+++ b/src/backends/backendsCommon/test/layerTests/BatchMatMulTestImpl.cpp
@@ -1,5 +1,5 @@
//
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
@@ -14,6 +14,7 @@
#include <armnnUtils/QuantizeHelper.hpp>
#include <armnnTestUtils/TensorCopyUtils.hpp>
#include <armnn/Optional.hpp>
+#include <armnn/BackendHelper.hpp>
template<armnn::DataType ArmnnType, typename T, std::size_t NumDims>
@@ -29,6 +30,7 @@ LayerTestResult<T, NumDims> BatchMatMulTestImpl(
const armnn::TensorInfo& inputYInfo,
const armnn::TensorInfo& outputInfo)
{
+ LayerTestResult<T, NumDims> result(outputInfo);
std::vector<T> outputActual(outputInfo.GetNumElements());
std::unique_ptr<armnn::ITensorHandle> inputXHandle = tensorHandleFactory.CreateTensorHandle(inputXInfo);
@@ -36,13 +38,27 @@ LayerTestResult<T, NumDims> BatchMatMulTestImpl(
std::unique_ptr<armnn::ITensorHandle> outputHandle = tensorHandleFactory.CreateTensorHandle(outputInfo);
armnn::BatchMatMulQueueDescriptor queueDescriptor;
- queueDescriptor.m_Parameters = descriptor;
+ queueDescriptor.m_Parameters = std::move(descriptor);
armnn::WorkloadInfo workloadInfo;
AddInputToWorkload(queueDescriptor, workloadInfo, inputXInfo, inputXHandle.get());
AddInputToWorkload(queueDescriptor, workloadInfo, inputYInfo, inputYHandle.get());
AddOutputToWorkload(queueDescriptor, workloadInfo, outputInfo, outputHandle.get());
+ // Don't execute if BatchMatMul is not supported, as an exception will be raised.
+ const armnn::BackendId& backend = workloadFactory.GetBackendId();
+ std::string reasonIfUnsupported;
+ armnn::LayerSupportHandle handle = armnn::GetILayerSupportByBackendId(backend);
+ result.m_Supported = handle.IsBatchMatMulSupported(inputXInfo,
+ inputYInfo,
+ outputInfo,
+ queueDescriptor.m_Parameters,
+ reasonIfUnsupported);
+ if (!result.m_Supported)
+ {
+ return result;
+ }
+
auto workload = workloadFactory.CreateWorkload(armnn::LayerType::BatchMatMul, queueDescriptor, workloadInfo);
inputXHandle->Allocate();
diff --git a/src/backends/neon/NeonLayerSupport.cpp b/src/backends/neon/NeonLayerSupport.cpp
index 4e4d7fa73d..cd4dca8edb 100644
--- a/src/backends/neon/NeonLayerSupport.cpp
+++ b/src/backends/neon/NeonLayerSupport.cpp
@@ -760,12 +760,28 @@ bool NeonLayerSupport::IsBatchMatMulSupported(const TensorInfo& inputX,
const BatchMatMulDescriptor& descriptor,
Optional<std::string&> reasonIfUnsupported) const
{
+ bool isFastMathEnabled = false;
+#if defined(ARMCOMPUTENEON_ENABLED)
+ if (m_ModelContextPtr)
+ {
+ if (m_ModelContextPtr.get() != nullptr)
+ {
+ auto modelOptions = dynamic_cast<NeonBackendModelContext*>(m_ModelContextPtr.get());
+ if (modelOptions)
+ {
+ isFastMathEnabled = modelOptions->IsFastMathEnabled();
+ }
+ }
+ }
+#endif
FORWARD_WORKLOAD_VALIDATE_FUNC(NeonBatchMatMulValidate,
reasonIfUnsupported,
inputX,
inputY,
output,
- descriptor);
+ descriptor,
+ isFastMathEnabled,
+ nullptr);
}
bool NeonLayerSupport::IsBatchNormalizationSupported(const TensorInfo& input,
diff --git a/src/backends/neon/NeonWorkloadFactory.cpp b/src/backends/neon/NeonWorkloadFactory.cpp
index 08168eca2f..c78b58d21d 100644
--- a/src/backends/neon/NeonWorkloadFactory.cpp
+++ b/src/backends/neon/NeonWorkloadFactory.cpp
@@ -155,7 +155,19 @@ std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateWorkload(LayerType type,
case LayerType::BatchMatMul :
{
auto batchMatMulQueueDescriptor = PolymorphicDowncast<const BatchMatMulQueueDescriptor*>(&descriptor);
- return std::make_unique<NeonBatchMatMulWorkload>(*batchMatMulQueueDescriptor, info);
+ bool isFastMathEnabled = false;
+ if (m_ModelContextPtr)
+ {
+ if (m_ModelContextPtr.get() != nullptr)
+ {
+ auto modelOptions = dynamic_cast<NeonBackendModelContext*>(m_ModelContextPtr.get());
+ if (modelOptions)
+ {
+ isFastMathEnabled = modelOptions->IsFastMathEnabled();
+ }
+ }
+ }
+ return std::make_unique<NeonBatchMatMulWorkload>(*batchMatMulQueueDescriptor, info, isFastMathEnabled);
}
case LayerType::BatchNormalization :
{
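The isFastMathEnabled flag read from NeonBackendModelContext above ultimately comes from the backend's "FastMathEnabled" model option. As a rough, illustrative sketch of how an application would opt in (assuming the usual Arm NN optimizer flow; the helper function and its parameters are placeholders, not part of this commit):

#include <armnn/ArmNN.hpp>

// Illustrative only: enable CpuAcc fast math so the new NeonBatchMatMulWorkload
// configures its CpuMatMulSettings with fast_math(true).
armnn::IOptimizedNetworkPtr OptimizeWithFastMath(const armnn::INetwork& network,
                                                 armnn::IRuntime& runtime)
{
    armnn::OptimizerOptions optimizerOptions;
    optimizerOptions.m_ModelOptions.push_back(
        armnn::BackendOptions("CpuAcc", { { "FastMathEnabled", true } }));

    return armnn::Optimize(network,
                           { armnn::Compute::CpuAcc },
                           runtime.GetDeviceSpec(),
                           optimizerOptions);
}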
diff --git a/src/backends/neon/test/NeonEndToEndTests.cpp b/src/backends/neon/test/NeonEndToEndTests.cpp
index b930004215..fb05cc415f 100644
--- a/src/backends/neon/test/NeonEndToEndTests.cpp
+++ b/src/backends/neon/test/NeonEndToEndTests.cpp
@@ -148,6 +148,11 @@ TEST_CASE("NeonBatchMatMulEndToEndFloat32Test")
BatchMatMulEndToEnd<armnn::DataType::Float32>(neonDefaultBackends);
}
+TEST_CASE("NeonBatchMatMulEndToEndInt8Test")
+{
+ BatchMatMulEndToEnd<armnn::DataType::QAsymmS8>(neonDefaultBackends);
+}
+
TEST_CASE("NeonConcatEndToEndDim0Test")
{
ConcatDim0EndToEnd<armnn::DataType::Float32>(neonDefaultBackends);
diff --git a/src/backends/neon/test/NeonLayerTests.cpp b/src/backends/neon/test/NeonLayerTests.cpp
index 4e6b6fa1d6..715060717f 100644
--- a/src/backends/neon/test/NeonLayerTests.cpp
+++ b/src/backends/neon/test/NeonLayerTests.cpp
@@ -76,20 +76,32 @@ ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNchwUint7, BatchToSpaceNdNchwTest7<D
// Batch Mat Mul
ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul2DSimpleFloat32, BatchMatMul2DSimpleTest<DataType::Float32>);
+ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul2DSimpleInt8, BatchMatMul2DSimpleTest<DataType::QAsymmS8>);
ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul3DSimpleFloat32, BatchMatMul3DSimpleTest<DataType::Float32>);
+ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul3DSimpleInt8, BatchMatMul3DSimpleTest<DataType::QAsymmS8>);
ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMulNCHWSimpleFloat32, BatchMatMulNCHWSimpleTest<DataType::Float32>);
+ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMulNCHWSimpleInt8, BatchMatMulNCHWSimpleTest<DataType::QAsymmS8>);
ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul3DBatchFloat32, BatchMatMul3DBatchTest<DataType::Float32>);
+ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul3DBatchInt8, BatchMatMul3DBatchTest<DataType::QAsymmS8>);
-ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul3DBroadcastFloat32, BatchMatMul3DBroadcastTest<DataType::Float32>);
+ARMNN_AUTO_TEST_CASE_WITH_THF(UNSUPPORTED_BatchMatMul3DBroadcastFloat32,
+ BatchMatMul3DBroadcastTest<DataType::Float32>);
+ARMNN_AUTO_TEST_CASE_WITH_THF(UNSUPPORTED_BatchMatMul3DBroadcastInt8,
+ BatchMatMul3DBroadcastTest<DataType::QAsymmS8>);
-ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul3D2DBroadcastFloat32, BatchMatMul3D2DBroadcastTest<DataType::Float32>);
+ARMNN_AUTO_TEST_CASE_WITH_THF(UNSUPPORTED_BatchMatMul3D2DBroadcastFloat32,
+ BatchMatMul3D2DBroadcastTest<DataType::Float32>);
+ARMNN_AUTO_TEST_CASE_WITH_THF(UNSUPPORTED_BatchMatMul3D2DBroadcastInt8,
+ BatchMatMul3D2DBroadcastTest<DataType::QAsymmS8>);
ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul2DTinyFloat32, BatchMatMul2DTinyTest<DataType::Float32>);
+ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul2DTinyInt8, BatchMatMul2DTinyTest<DataType::QAsymmS8>);
ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul2DTranspSimpleFloat32, BatchMatMul2DTranspSimpleTest<DataType::Float32>);
+ARMNN_AUTO_TEST_CASE_WITH_THF(BatchMatMul2DTranspSimpleInt8, BatchMatMul2DTranspSimpleTest<DataType::QAsymmS8>);
// Convolution
ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleConvolution1d, Convolution1dTest, true)
diff --git a/src/backends/neon/workloads/NeonBatchMatMulWorkload.cpp b/src/backends/neon/workloads/NeonBatchMatMulWorkload.cpp
index 3d8651f995..9b22033bd1 100644
--- a/src/backends/neon/workloads/NeonBatchMatMulWorkload.cpp
+++ b/src/backends/neon/workloads/NeonBatchMatMulWorkload.cpp
@@ -1,5 +1,5 @@
//
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
@@ -8,22 +8,18 @@
#include "NeonWorkloadUtils.hpp"
#include <armnn/utility/PolymorphicDowncast.hpp>
-
-#include <armnnUtils/Permute.hpp>
+#include <aclCommon/ArmComputeUtils.hpp>
#include <backendsCommon/WorkloadUtils.hpp>
-#include <arm_compute/runtime/NEON/functions/NEGEMM.h>
-
-#include <arm_compute/runtime/NEON/functions/NEPermute.h>
-
-
namespace armnn
{
-arm_compute::Status NeonBatchMatMulValidate(const TensorInfo& inputX,
- const TensorInfo& inputY,
- const TensorInfo& output,
- const BatchMatMulDescriptor& descriptor)
+arm_compute::Status NeonBatchMatMulValidate(const TensorInfo& inputInfoX,
+ const TensorInfo& inputInfoY,
+ const TensorInfo& outputInfo,
+ const BatchMatMulDescriptor& descriptor,
+ const bool isFastMathEnabled,
+ const ActivationDescriptor* activationDescriptor)
{
if (descriptor.m_AdjointX || descriptor.m_AdjointY )
{
@@ -34,157 +30,78 @@ arm_compute::Status NeonBatchMatMulValidate(const TensorInfo& inputX,
throw Exception("Only supported the MatMul in the last 2 dimensions");
}
- const auto aclInputXInfo = armcomputetensorutils::BuildArmComputeTensorInfo(inputX, descriptor.m_DataLayoutX);
- const auto aclInputYInfo = armcomputetensorutils::BuildArmComputeTensorInfo(inputY, descriptor.m_DataLayoutY);
- const auto aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+ arm_compute::TensorInfo aclInputInfoX = armcomputetensorutils::BuildArmComputeTensorInfo(inputInfoX);
+ arm_compute::TensorInfo aclInputInfoY = armcomputetensorutils::BuildArmComputeTensorInfo(inputInfoY);
+ arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(outputInfo);
- arm_compute::Status statusGEMM = arm_compute::Status(arm_compute::ErrorCode::OK);
- arm_compute::Status statusPermuteX = arm_compute::Status(arm_compute::ErrorCode::OK);
- arm_compute::Status statusPermuteY = arm_compute::Status(arm_compute::ErrorCode::OK);
+ // The GeMM dispatch kernel handles dynamic inputs differently to static ones, so this flag needs to be set
+ aclInputInfoX.set_are_values_constant(false);
+ aclInputInfoY.set_are_values_constant(false);
- arm_compute::TensorInfo aclPermutedXInfo = arm_compute::TensorInfo();
- arm_compute::TensorInfo aclPermutedYInfo = arm_compute::TensorInfo();
+ const arm_compute::ActivationLayerInfo activationInfo = ConvertActivationDescriptorToAclActivationLayerInfo(
+ activationDescriptor);
- if (descriptor.m_TransposeX == true)
- {
- auto permutationXVector = GeneratePermutationVectorOnLastTwoDimensions(inputX.GetNumDimensions());
- const auto aclPermutationXVector = armcomputetensorutils::BuildArmComputePermutationVector(permutationXVector);
- const TensorInfo permutedXInfo = armnnUtils::Permuted(inputX, permutationXVector);
- aclPermutedXInfo = armcomputetensorutils::BuildArmComputeTensorInfo(permutedXInfo);
-
- statusPermuteX = arm_compute::NEPermute::validate(&aclInputXInfo,
- &aclPermutedXInfo,
- aclPermutationXVector);
- }
+ arm_compute::MatMulInfo matMulInfo;
+ matMulInfo.adj_lhs(descriptor.m_TransposeX);
+ matMulInfo.adj_rhs(descriptor.m_TransposeY);
+ matMulInfo.fused_activation(activationInfo);
- if (descriptor.m_TransposeY == true)
- {
- auto permutationYVector = GeneratePermutationVectorOnLastTwoDimensions(inputY.GetNumDimensions());
- const auto aclPermutationYVector = armcomputetensorutils::BuildArmComputePermutationVector(permutationYVector);
- const TensorInfo permutedYInfo = armnnUtils::Permuted(inputY, permutationYVector);
- aclPermutedYInfo = armcomputetensorutils::BuildArmComputeTensorInfo(permutedYInfo);
-
- statusPermuteY = arm_compute::NEPermute::validate(&aclInputYInfo,
- &aclPermutedYInfo,
- aclPermutationYVector);
- }
-
- const arm_compute::GEMMInfo& gemm_info = arm_compute::GEMMInfo(false, // is inputX reshaped
- false, // is inputY reshaped
- false); // is inputY reshaped only 1st run
-
- statusGEMM = arm_compute::NEGEMM::validate(descriptor.m_TransposeX ? &aclPermutedXInfo : &aclInputXInfo,
- descriptor.m_TransposeY ? &aclPermutedYInfo : &aclInputYInfo,
- nullptr,
- &aclOutputInfo,
- 1.0,
- 0,
- gemm_info);
-
- if (statusPermuteX.error_code() == arm_compute::ErrorCode::OK &&
- statusPermuteY.error_code() == arm_compute::ErrorCode::OK &&
- statusGEMM.error_code() == arm_compute::ErrorCode::OK)
- {
- return arm_compute::Status(arm_compute::ErrorCode::OK,
- "All BatchMatMul layers validate status OK.");
- }
- else
- {
- return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR,
- "BatchMatMul layer validate status failed."
- + statusGEMM.error_description()
- + statusPermuteX.error_description()
- + statusPermuteY.error_description());
- }
+ arm_compute::CpuMatMulSettings settings;
+ settings.fast_math(isFastMathEnabled);
+ return arm_compute::NEMatMul::validate(&aclInputInfoX, &aclInputInfoY, &aclOutputInfo, matMulInfo, settings);
}
-NeonBatchMatMulWorkload::NeonBatchMatMulWorkload(
- const BatchMatMulQueueDescriptor& descriptor, const WorkloadInfo& info)
+NeonBatchMatMulWorkload::NeonBatchMatMulWorkload(const BatchMatMulQueueDescriptor& descriptor,
+ const WorkloadInfo& info,
+ const bool isFastMathEnabled)
: NeonBaseWorkload<BatchMatMulQueueDescriptor>(descriptor, info)
{
if (descriptor.m_Parameters.m_AdjointX || descriptor.m_Parameters.m_AdjointY )
{
throw Exception("Support for adjoint not implemented.");
}
- if (descriptor.m_Parameters.m_DataLayoutX != armnn::DataLayout::NCHW ||
- descriptor.m_Parameters.m_DataLayoutY != armnn::DataLayout::NCHW )
+ if (descriptor.m_Parameters.m_DataLayoutX != armnn::DataLayout::NCHW
+ || descriptor.m_Parameters.m_DataLayoutY != armnn::DataLayout::NCHW )
{
throw Exception("Only supported the MatMul in the last 2 dimensions");
}
- // Report Profiling Details
- ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonBatchMatMulWorkload_Construct",
- descriptor.m_Parameters,
- info,
- this->GetGuid());
-
m_Data.ValidateInputsOutputs("NeonBatchMatMulWorkload", 2, 1);
arm_compute::ITensor& inputX = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
arm_compute::ITensor& inputY = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
- auto outputHandle = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Outputs[0]);
- arm_compute::ITensor& output = outputHandle->GetTensor();
+ arm_compute::ITensor& output = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
- arm_compute::DataLayout aclDataLayoutX = ConvertDataLayout(m_Data.m_Parameters.m_DataLayoutX);
- arm_compute::DataLayout aclDataLayoutY = ConvertDataLayout(m_Data.m_Parameters.m_DataLayoutY);
+ // The GeMM dispatch kernel handles dynamic inputs differently to static ones, so this flag needs to be set
+ inputX.info()->set_are_values_constant(false);
+ inputY.info()->set_are_values_constant(false);
- inputX.info()->set_data_layout(aclDataLayoutX);
- inputY.info()->set_data_layout(aclDataLayoutY);
+ const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor);
- if (descriptor.m_Parameters.m_TransposeX == true)
- {
- armnn::PermutationVector permutationXVector
- = GeneratePermutationVectorOnLastTwoDimensions(info.m_InputTensorInfos[0].GetNumDimensions());
- const TensorInfo permutedXInfo = armnnUtils::Permuted(info.m_InputTensorInfos[0], permutationXVector);
- const auto aclPermutationXVector = armcomputetensorutils::BuildArmComputePermutationVector(permutationXVector);
-
- auto permuteLayerX = std::make_unique<arm_compute::NEPermute>();
- BuildArmComputeTensor(m_PermutedTensorX, permutedXInfo);
- InitialiseArmComputeTensorEmpty(m_PermutedTensorX);
- permuteLayerX->configure(&inputX, &m_PermutedTensorX, aclPermutationXVector);
- m_PermuteLayerX.reset(permuteLayerX.release());
- }
+ arm_compute::MatMulInfo matMulInfo;
+ matMulInfo.adj_lhs(descriptor.m_Parameters.m_TransposeX);
+ matMulInfo.adj_rhs(descriptor.m_Parameters.m_TransposeY);
+ matMulInfo.fused_activation(activationInfo);
- if (descriptor.m_Parameters.m_TransposeY == true)
- {
- armnn::PermutationVector permutationYVector
- = GeneratePermutationVectorOnLastTwoDimensions(info.m_InputTensorInfos[1].GetNumDimensions());
- const TensorInfo permutedYInfo = armnnUtils::Permuted(info.m_InputTensorInfos[1], permutationYVector);
- const auto aclPermutationYVector = armcomputetensorutils::BuildArmComputePermutationVector(permutationYVector);
-
- auto permuteLayerY = std::make_unique<arm_compute::NEPermute>();
- BuildArmComputeTensor(m_PermutedTensorY, permutedYInfo);
- InitialiseArmComputeTensorEmpty(m_PermutedTensorY);
- permuteLayerY->configure(&inputY, &m_PermutedTensorY, aclPermutationYVector);
- m_PermuteLayerY.reset(permuteLayerY.release());
- }
+ arm_compute::CpuMatMulSettings settings;
+ settings.fast_math(isFastMathEnabled);
+
+ m_MatMulLayer.configure(&inputX, &inputY, &output, matMulInfo, settings);
- const arm_compute::GEMMInfo& gemm_info = arm_compute::GEMMInfo(false, // is inputX reshaped
- false, // is inputY reshaped
- false); // is inputY reshaped only 1st run
- auto gemmLayer = std::make_unique<arm_compute::NEGEMM>();
- gemmLayer->configure(descriptor.m_Parameters.m_TransposeX ? &m_PermutedTensorX : &inputX,
- descriptor.m_Parameters.m_TransposeY ? &m_PermutedTensorY : &inputY,
- nullptr,
- &output,
- 1.0,
- 0,
- gemm_info);
- m_GEMMLayer.reset(gemmLayer.release());
+ // Report Profiling Details
+ WorkloadInfo detailsInfo;
+ detailsInfo.m_InputTensorInfos = info.m_InputTensorInfos;
+ detailsInfo.m_OutputTensorInfos = info.m_OutputTensorInfos;
+ ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonBatchMatMulWorkload_Construct",
+ descriptor.m_Parameters,
+ detailsInfo,
+ GetGuid());
}
void NeonBatchMatMulWorkload::Execute() const
{
ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonBatchMatMulWorkload_Execute", this->GetGuid());
- if (m_PermuteLayerX)
- {
- m_PermuteLayerX->run();
- }
- if (m_PermuteLayerY)
- {
- m_PermuteLayerY->run();
- }
- m_GEMMLayer->run();
+ m_MatMulLayer.run();
}
} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonBatchMatMulWorkload.hpp b/src/backends/neon/workloads/NeonBatchMatMulWorkload.hpp
index cb004d2478..27144f2400 100644
--- a/src/backends/neon/workloads/NeonBatchMatMulWorkload.hpp
+++ b/src/backends/neon/workloads/NeonBatchMatMulWorkload.hpp
@@ -1,5 +1,5 @@
//
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
@@ -7,35 +7,27 @@
#include "NeonBaseWorkload.hpp"
-#include <arm_compute/runtime/IFunction.h>
-#include <arm_compute/runtime/Tensor.h>
-
-#include <memory>
+#include <arm_compute/runtime/NEON/functions/NEMatMul.h>
namespace armnn
{
- arm_compute::Status NeonBatchMatMulValidate(const TensorInfo& inputX,
- const TensorInfo& inputY,
- const TensorInfo& output,
- const BatchMatMulDescriptor& descriptor);
+ arm_compute::Status NeonBatchMatMulValidate(const TensorInfo& inputInfoX,
+ const TensorInfo& inputInfoY,
+ const TensorInfo& outputInfo,
+ const BatchMatMulDescriptor& descriptor,
+ const bool isFastMathEnabled,
+ const ActivationDescriptor* activationDescriptor);
+
class NeonBatchMatMulWorkload : public NeonBaseWorkload<BatchMatMulQueueDescriptor>
{
public:
NeonBatchMatMulWorkload(const BatchMatMulQueueDescriptor& descriptor,
- const WorkloadInfo& info);
+ const WorkloadInfo& info,
+ const bool isFastMathEnabled);
virtual void Execute() const override;
private:
- // ACL layers required to fully form a Batch Mat Mul layer.
- std::unique_ptr<arm_compute::IFunction> m_GEMMLayer;
- std::unique_ptr<arm_compute::IFunction> m_PermuteLayerX;
- std::unique_ptr<arm_compute::IFunction> m_PermuteLayerY;
-
- // Additional ACL arm_compute::Tensors.
- // Required to perform permutations.
- arm_compute::Tensor m_PermutedTensorX;
- arm_compute::Tensor m_PermutedTensorY;
-
+ mutable arm_compute::NEMatMul m_MatMulLayer;
};
} //namespace armnn
diff --git a/src/backends/reference/test/RefEndToEndTests.cpp b/src/backends/reference/test/RefEndToEndTests.cpp
index 8bf414fdb0..95004c4dc2 100644
--- a/src/backends/reference/test/RefEndToEndTests.cpp
+++ b/src/backends/reference/test/RefEndToEndTests.cpp
@@ -486,6 +486,11 @@ TEST_CASE("RefBatchMatMulEndToEndFloat32Test")
BatchMatMulEndToEnd<armnn::DataType::Float32>(defaultBackends);
}
+TEST_CASE("RefBatchMatMulEndToEndInt8Test")
+{
+ BatchMatMulEndToEnd<armnn::DataType::QAsymmS8>(defaultBackends);
+}
+
TEST_CASE("RefBatchToSpaceNdEndToEndFloat32NHWCTest")
{
BatchToSpaceNdEndToEnd<armnn::DataType::Float32>(defaultBackends, armnn::DataLayout::NHWC);