author    Teresa Charlin <teresa.charlinreyes@arm.com>    2022-10-19 08:48:07 +0100
committer TeresaARM <teresa.charlinreyes@arm.com>         2023-01-09 15:09:46 +0000
commit    94916a5c06065bca0b232106bd4ae68f9986b7b0 (patch)
tree      1510f0e09f48305ce3a2c32e7adfddcc2da7b39a
parent    c998108fbd4c134286b481768a873c54ae744a70 (diff)
download  armnn-94916a5c06065bca0b232106bd4ae68f9986b7b0.tar.gz
IVGCVSW-6493 Add GpuAcc Batch MatMul workload Fp32
* GpuAcc only supports up to 3D, so no 4D tests have been added.

Signed-off-by: Teresa Charlin <teresa.charlinreyes@arm.com>
Change-Id: Ie926cd45c350be624cbdc6cb27c89d2d3f60884b
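For reference (not part of the patch), below is a minimal sketch of how the new GpuAcc Batch MatMul path could be exercised through the public Arm NN C++ API. It assumes the existing INetwork::AddBatchMatMulLayer API and an OpenCL-capable device at runtime; tensor shapes and layer names are illustrative only.

    #include <armnn/ArmNN.hpp>

    #include <iostream>
    #include <vector>

    int main()
    {
        using namespace armnn;

        // Two 3D Float32 operands; the MatMul is performed on the last two dimensions.
        TensorInfo inputXInfo(TensorShape({2, 2, 2}), DataType::Float32);
        TensorInfo inputYInfo(TensorShape({2, 2, 2}), DataType::Float32);
        TensorInfo outputInfo(TensorShape({2, 2, 2}), DataType::Float32);

        INetworkPtr network = INetwork::Create();
        IConnectableLayer* inputX = network->AddInputLayer(0, "inputX");
        IConnectableLayer* inputY = network->AddInputLayer(1, "inputY");

        // Default descriptor: no transpose/adjoint and NCHW data layouts,
        // which is what ClBatchMatMulWorkload accepts.
        BatchMatMulDescriptor descriptor;
        IConnectableLayer* batchMatMul = network->AddBatchMatMulLayer(descriptor, "batchMatMul");
        IConnectableLayer* output = network->AddOutputLayer(0, "output");

        inputX->GetOutputSlot(0).Connect(batchMatMul->GetInputSlot(0));
        inputY->GetOutputSlot(0).Connect(batchMatMul->GetInputSlot(1));
        batchMatMul->GetOutputSlot(0).Connect(output->GetInputSlot(0));

        inputX->GetOutputSlot(0).SetTensorInfo(inputXInfo);
        inputY->GetOutputSlot(0).SetTensorInfo(inputYInfo);
        batchMatMul->GetOutputSlot(0).SetTensorInfo(outputInfo);

        // Optimize for GpuAcc only; layer support is checked here through
        // ClLayerSupport::IsBatchMatMulSupported / ClBatchMatMulValidate.
        IRuntime::CreationOptions options;
        IRuntimePtr runtime = IRuntime::Create(options);
        std::vector<BackendId> backends = { Compute::GpuAcc };
        IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec());

        NetworkId networkId;
        Status status = runtime->LoadNetwork(networkId, std::move(optNet));
        std::cout << "LoadNetwork: " << (status == Status::Success ? "OK" : "failed") << std::endl;
        return 0;
    }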
-rw-r--r--  delegate/src/test/BatchMatMulTest.cpp                 17
-rw-r--r--  docs/02_operator_list.dox                              9
-rw-r--r--  src/backends/cl/ClLayerSupport.cpp                    21
-rw-r--r--  src/backends/cl/ClLayerSupport.hpp                     6
-rw-r--r--  src/backends/cl/ClWorkloadFactory.cpp                  5
-rw-r--r--  src/backends/cl/backend.mk                             1
-rw-r--r--  src/backends/cl/test/ClLayerTests.cpp                 23
-rw-r--r--  src/backends/cl/workloads/CMakeLists.txt               2
-rw-r--r--  src/backends/cl/workloads/ClBatchMatMulWorkload.cpp  203
-rw-r--r--  src/backends/cl/workloads/ClBatchMatMulWorkload.hpp   41
-rw-r--r--  src/backends/cl/workloads/ClWorkloads.hpp              1
11 files changed, 324 insertions, 5 deletions
diff --git a/delegate/src/test/BatchMatMulTest.cpp b/delegate/src/test/BatchMatMulTest.cpp
index e5cb976c45..d13d8dcf43 100644
--- a/delegate/src/test/BatchMatMulTest.cpp
+++ b/delegate/src/test/BatchMatMulTest.cpp
@@ -268,7 +268,7 @@ namespace armnnDelegate
{
// Set input data
std::vector<int32_t> LHSInputShape { 2,2,2 };
- std::vector<int32_t> RHSInputShape { 1,2,2 };
+ std::vector<int32_t> RHSInputShape { 2,2 };
std::vector<int32_t> outputShape { 2,2,2 };
std::vector<float> LHSInputValues = { 1, 2,
@@ -670,4 +670,19 @@ namespace armnnDelegate
BatchMatMul2DFp32SimpleAdjointTest(backends);
}
}
+ TEST_SUITE("BATCH_MATMUL_GpuAccTests")
+ {
+ TEST_CASE("BATCH_MATMUL_Fp32_GpuAccTests")
+ {
+ std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
+ BatchMatMul2DFp32SimpleTest (backends);
+ BatchMatMul3DFp32SimpleTest (backends);
+ BatchMatMul3DFp32BatchTest (backends);
+ BatchMatMul3DFp32BroadcastTest (backends);
+ BatchMatMul3D2DFp32BroadcastTest (backends);
+ BatchMatMul2DFp32TinyTest (backends);
+ BatchMatMulNonSquareFp32Test (backends);
+ BatchMatMul2DFp32SimpleAdjointTest(backends);
+ }
+ }
}
diff --git a/docs/02_operator_list.dox b/docs/02_operator_list.dox
index d9a3d2c83b..007d4f5e35 100644
--- a/docs/02_operator_list.dox
+++ b/docs/02_operator_list.dox
@@ -304,12 +304,13 @@ where N = batches, C = channels, H = height, W = width
<td>GpuAcc
<td>
<ul>
- <li>N/A
+ <li>All
</ul>
<td>
- <ul>
- <li>N/A
- </ul>
+ <table>
+ <tr><th>
+ <tr><td>FLOAT32
+ </table>
<tr>
<td rowspan="3">BatchNormalizationLayer
<td rowspan="3" style="width:200px;"> Layer to perform batch normalization.
diff --git a/src/backends/cl/ClLayerSupport.cpp b/src/backends/cl/ClLayerSupport.cpp
index a61a5bb640..cb2d756037 100644
--- a/src/backends/cl/ClLayerSupport.cpp
+++ b/src/backends/cl/ClLayerSupport.cpp
@@ -22,6 +22,7 @@
#include "workloads/ClAdditionWorkload.hpp"
#include "workloads/ClActivationWorkload.hpp"
#include "workloads/ClArgMinMaxWorkload.hpp"
+#include "workloads/ClBatchMatMulWorkload.hpp"
#include "workloads/ClBatchNormalizationFloatWorkload.hpp"
#include "workloads/ClBatchToSpaceNdWorkload.hpp"
#include "workloads/ClCastWorkload.hpp"
@@ -201,6 +202,12 @@ bool ClLayerSupport::IsLayerSupported(const LayerType& type,
infos[1],
*(PolymorphicDowncast<const ArgMinMaxDescriptor*>(&descriptor)),
reasonIfUnsupported);
+ case LayerType::BatchMatMul:
+ return IsBatchMatMulSupported(infos[0],
+ infos[1],
+ infos[2],
+ *(PolymorphicDowncast<const BatchMatMulDescriptor*>(&descriptor)),
+ reasonIfUnsupported);
case LayerType::BatchNormalization:
return IsBatchNormalizationSupported(infos[0],
infos[1],
@@ -640,6 +647,20 @@ bool ClLayerSupport::IsArgMinMaxSupported(const TensorInfo& input,
descriptor);
}
+bool ClLayerSupport::IsBatchMatMulSupported(const TensorInfo& inputX,
+ const TensorInfo& inputY,
+ const TensorInfo& output,
+ const BatchMatMulDescriptor& descriptor,
+ Optional<std::string&> reasonIfUnsupported) const
+{
+ FORWARD_WORKLOAD_VALIDATE_FUNC(ClBatchMatMulValidate,
+ reasonIfUnsupported,
+ inputX,
+ inputY,
+ output,
+ descriptor);
+}
+
bool ClLayerSupport::IsBatchNormalizationSupported(const TensorInfo& input,
const TensorInfo& output,
const TensorInfo& mean,
diff --git a/src/backends/cl/ClLayerSupport.hpp b/src/backends/cl/ClLayerSupport.hpp
index 27311f74aa..2d784e3df8 100644
--- a/src/backends/cl/ClLayerSupport.hpp
+++ b/src/backends/cl/ClLayerSupport.hpp
@@ -40,6 +40,12 @@ public:
const ArgMinMaxDescriptor& descriptor,
Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+ bool IsBatchMatMulSupported(const TensorInfo& inputX,
+ const TensorInfo& inputY,
+ const TensorInfo& output,
+ const BatchMatMulDescriptor& descriptor,
+ Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const;
+
bool IsBatchNormalizationSupported(const TensorInfo& input,
const TensorInfo& output,
const TensorInfo& mean,
diff --git a/src/backends/cl/ClWorkloadFactory.cpp b/src/backends/cl/ClWorkloadFactory.cpp
index d0079abd38..6bf510a2ef 100644
--- a/src/backends/cl/ClWorkloadFactory.cpp
+++ b/src/backends/cl/ClWorkloadFactory.cpp
@@ -265,6 +265,11 @@ std::unique_ptr<IWorkload> ClWorkloadFactory::CreateWorkload(LayerType type,
auto argMinMaxQueueDescriptor = PolymorphicDowncast<const ArgMinMaxQueueDescriptor*>(&descriptor);
return MakeWorkload<ClArgMinMaxWorkload>(*argMinMaxQueueDescriptor, info, m_CLCompileContext);
}
+ case LayerType::BatchMatMul :
+ {
+ auto batchMatMulQueueDescriptor = PolymorphicDowncast<const BatchMatMulQueueDescriptor*>(&descriptor);
+ return std::make_unique<ClBatchMatMulWorkload>(*batchMatMulQueueDescriptor, info, m_CLCompileContext);
+ }
case LayerType::BatchNormalization :
{
auto batchNormalizationQueueDescriptor
diff --git a/src/backends/cl/backend.mk b/src/backends/cl/backend.mk
index 6fda16db05..1f97ae7cc8 100644
--- a/src/backends/cl/backend.mk
+++ b/src/backends/cl/backend.mk
@@ -30,6 +30,7 @@ BACKEND_SOURCES := \
workloads/ClActivationWorkload.cpp \
workloads/ClAdditionWorkload.cpp \
workloads/ClArgMinMaxWorkload.cpp \
+ workloads/ClBatchMatMulWorkload.cpp \
workloads/ClBatchNormalizationFloatWorkload.cpp \
workloads/ClBatchToSpaceNdWorkload.cpp \
workloads/ClCastWorkload.cpp \
diff --git a/src/backends/cl/test/ClLayerTests.cpp b/src/backends/cl/test/ClLayerTests.cpp
index 855697c9be..4ba2a9ec3b 100644
--- a/src/backends/cl/test/ClLayerTests.cpp
+++ b/src/backends/cl/test/ClLayerTests.cpp
@@ -73,6 +73,29 @@ ARMNN_AUTO_TEST_FIXTURE_WITH_THF(Tanh, ClContextControlFixture, TanhTest)
// Elu Activation
ARMNN_AUTO_TEST_FIXTURE_WITH_THF(Elu, ClContextControlFixture, EluTest)
+// Batch Mat Mul
+ARMNN_AUTO_TEST_FIXTURE_WITH_THF(BatchMatMul2DSimpleFloat32,
+ ClContextControlFixture,
+ BatchMatMul2DSimpleTest<DataType::Float32>);
+ARMNN_AUTO_TEST_FIXTURE_WITH_THF(BatchMatMul3DSimpleFloat32,
+ ClContextControlFixture,
+ BatchMatMul3DSimpleTest<DataType::Float32>);
+ARMNN_AUTO_TEST_FIXTURE_WITH_THF(BatchMatMul3DBatchFloat32,
+ ClContextControlFixture,
+ BatchMatMul3DBatchTest<DataType::Float32>);
+ARMNN_AUTO_TEST_FIXTURE_WITH_THF(BatchMatMul3DBroadcastFloat32,
+ ClContextControlFixture,
+ BatchMatMul3DBroadcastTest<DataType::Float32>);
+ARMNN_AUTO_TEST_FIXTURE_WITH_THF(BatchMatMul3D2DBroadcastFloat32,
+ ClContextControlFixture,
+ BatchMatMul3D2DBroadcastTest<DataType::Float32>);
+ARMNN_AUTO_TEST_FIXTURE_WITH_THF(BatchMatMul2DTinyFloat32,
+ ClContextControlFixture,
+ BatchMatMul2DTinyTest<DataType::Float32>);
+ARMNN_AUTO_TEST_FIXTURE_WITH_THF(BatchMatMul2DTranspSimpleFloat32,
+ ClContextControlFixture,
+ BatchMatMul2DTranspSimpleTest<DataType::Float32>);
+
// Batch To Space
ARMNN_AUTO_TEST_FIXTURE_WITH_THF(BatchToSpaceNdNhwcFloat321,
ClContextControlFixture,
diff --git a/src/backends/cl/workloads/CMakeLists.txt b/src/backends/cl/workloads/CMakeLists.txt
index aef7fc7ad2..8616dec078 100644
--- a/src/backends/cl/workloads/CMakeLists.txt
+++ b/src/backends/cl/workloads/CMakeLists.txt
@@ -12,6 +12,8 @@ list(APPEND armnnClBackendWorkloads_sources
ClAdditionWorkload.hpp
ClArgMinMaxWorkload.cpp
ClArgMinMaxWorkload.hpp
+ ClBatchMatMulWorkload.cpp
+ ClBatchMatMulWorkload.hpp
ClBatchNormalizationFloatWorkload.cpp
ClBatchNormalizationFloatWorkload.hpp
ClBatchToSpaceNdWorkload.cpp
diff --git a/src/backends/cl/workloads/ClBatchMatMulWorkload.cpp b/src/backends/cl/workloads/ClBatchMatMulWorkload.cpp
new file mode 100644
index 0000000000..4acdef5e5c
--- /dev/null
+++ b/src/backends/cl/workloads/ClBatchMatMulWorkload.cpp
@@ -0,0 +1,203 @@
+//
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClBatchMatMulWorkload.hpp"
+
+#include "ClWorkloadUtils.hpp"
+
+#include <aclCommon/ArmComputeTensorUtils.hpp>
+#include <aclCommon/ArmComputeUtils.hpp>
+
+#include <armnn/utility/PolymorphicDowncast.hpp>
+
+#include <armnnUtils/Permute.hpp>
+
+#include <backendsCommon/WorkloadUtils.hpp>
+
+#include <cl/ClTensorHandle.hpp>
+
+#include <arm_compute/runtime/CL/functions/CLGEMM.h>
+#include <arm_compute/runtime/CL/functions/CLPermute.h>
+
+
+namespace armnn
+{
+arm_compute::Status ClBatchMatMulValidate(const TensorInfo& inputX,
+ const TensorInfo& inputY,
+ const TensorInfo& output,
+ const BatchMatMulDescriptor& descriptor)
+{
+ if (descriptor.m_AdjointX || descriptor.m_AdjointY)
+ {
+ throw Exception("Support for adjoint not implemented.");
+ }
+ if (descriptor.m_DataLayoutX != armnn::DataLayout::NCHW || descriptor.m_DataLayoutY != armnn::DataLayout::NCHW)
+ {
+ throw Exception("Only MatMul on the last 2 dimensions is supported.");
+ }
+
+ arm_compute::Status statusGEMM = arm_compute::Status(arm_compute::ErrorCode::OK);
+ arm_compute::Status statusPermuteX = arm_compute::Status(arm_compute::ErrorCode::OK);
+ arm_compute::Status statusPermuteY = arm_compute::Status(arm_compute::ErrorCode::OK);
+
+ const auto aclInputXInfo = armcomputetensorutils::BuildArmComputeTensorInfo(inputX, descriptor.m_DataLayoutX);
+ const auto aclInputYInfo = armcomputetensorutils::BuildArmComputeTensorInfo(inputY, descriptor.m_DataLayoutY);
+ const auto aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+ arm_compute::TensorInfo aclPermutedXInfo = arm_compute::TensorInfo();
+ arm_compute::TensorInfo aclPermutedYInfo = arm_compute::TensorInfo();
+
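+ // A requested transpose is mapped to a CLPermute of the last two dimensions; the permuted
+ // tensor info is then validated as the corresponding CLGEMM input below.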
+ if (descriptor.m_TransposeX == true)
+ {
+ auto permutationXVector = GeneratePermutationVectorOnLastTwoDimensions(inputX.GetNumDimensions());
+ const auto aclPermutationXVector = armcomputetensorutils::BuildArmComputePermutationVector(permutationXVector);
+ const TensorInfo permutedXInfo = armnnUtils::Permuted(inputX, permutationXVector);
+ aclPermutedXInfo = armcomputetensorutils::BuildArmComputeTensorInfo(permutedXInfo);
+
+ statusPermuteX = arm_compute::CLPermute::validate(&aclInputXInfo,
+ &aclPermutedXInfo,
+ aclPermutationXVector);
+ }
+
+ if (descriptor.m_TransposeY == true)
+ {
+ auto permutationYVector = GeneratePermutationVectorOnLastTwoDimensions(inputY.GetNumDimensions());
+ const auto aclPermutationYVector = armcomputetensorutils::BuildArmComputePermutationVector(permutationYVector);
+ const TensorInfo permutedYInfo = armnnUtils::Permuted(inputY, permutationYVector);
+ aclPermutedYInfo = armcomputetensorutils::BuildArmComputeTensorInfo(permutedYInfo);
+
+ statusPermuteY = arm_compute::CLPermute::validate(&aclInputYInfo,
+ &aclPermutedYInfo,
+ aclPermutationYVector);
+
+ }
+
+ const arm_compute::GEMMInfo& gemm_info = arm_compute::GEMMInfo(false, // is inputX reshaped
+ false, // is inputY reshaped
+ false); // is inputY reshaped only 1st run
+
+
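+ // Validate the GEMM itself on the (possibly permuted) operands; no bias, alpha = 1.0, beta = 0.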
+ statusGEMM = arm_compute::CLGEMM::validate(descriptor.m_TransposeX ? &aclPermutedXInfo : &aclInputXInfo,
+ descriptor.m_TransposeY ? &aclPermutedYInfo : &aclInputYInfo,
+ nullptr,
+ &aclOutputInfo,
+ 1.0,
+ 0,
+ gemm_info);
+
+ if (statusPermuteX.error_code() == arm_compute::ErrorCode::OK &&
+ statusPermuteY.error_code() == arm_compute::ErrorCode::OK &&
+ statusGEMM.error_code() == arm_compute::ErrorCode::OK)
+ {
+ return arm_compute::Status(arm_compute::ErrorCode::OK,
+ "All Batch Mat Mul layers validate status OK.");
+ }
+ else
+ {
+ return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR,
+ "BatchMatMul layer validate status failed."
+ + statusGEMM.error_description()
+ + statusPermuteX.error_description()
+ + statusPermuteY.error_description());
+ }
+
+}
+
+ClBatchMatMulWorkload::ClBatchMatMulWorkload(const BatchMatMulQueueDescriptor& descriptor,
+ const WorkloadInfo& info,
+ const arm_compute::CLCompileContext& clCompileContext)
+ : ClBaseWorkload<BatchMatMulQueueDescriptor>(descriptor, info)
+{
+ // Report Profiling Details
+ ARMNN_REPORT_PROFILING_WORKLOAD_DESC("ClBatchMatMulWorkload_Construct",
+ descriptor.m_Parameters,
+ info,
+ this->GetGuid());
+
+ if (descriptor.m_Parameters.m_AdjointX || descriptor.m_Parameters.m_AdjointY)
+ {
+ throw Exception("Support for adjoint not implemented.");
+ }
+ if (descriptor.m_Parameters.m_DataLayoutX != armnn::DataLayout::NCHW ||
+ descriptor.m_Parameters.m_DataLayoutY != armnn::DataLayout::NCHW)
+ {
+ throw Exception("Only MatMul on the last 2 dimensions is supported.");
+ }
+
+ m_Data.ValidateInputsOutputs("ClBatchMatMulWorkload", 2, 1);
+
+ const arm_compute::ICLTensor& inputX = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ const arm_compute::ICLTensor& inputY = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
+ arm_compute::ICLTensor& output = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+ inputX.info()->set_data_layout(armcomputetensorutils::ConvertDataLayout(m_Data.m_Parameters.m_DataLayoutX));
+ inputY.info()->set_data_layout(armcomputetensorutils::ConvertDataLayout(m_Data.m_Parameters.m_DataLayoutY));
+
+ arm_compute::TensorInfo aclPermutedXInfo = arm_compute::TensorInfo();
+ arm_compute::TensorInfo aclPermutedYInfo = arm_compute::TensorInfo();
+
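+ // Permute layers are only configured when a transpose is requested; Execute() runs them
+ // before the GEMM so that CLGEMM consumes the already-transposed operands.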
+ if (descriptor.m_Parameters.m_TransposeX == true)
+ {
+ armnn::PermutationVector permutationXVector
+ = GeneratePermutationVectorOnLastTwoDimensions(info.m_InputTensorInfos[0].GetNumDimensions());
+ const TensorInfo permutedXInfo = armnnUtils::Permuted(info.m_InputTensorInfos[0], permutationXVector);
+ const auto aclPermutationXVector = armcomputetensorutils::BuildArmComputePermutationVector(permutationXVector);
+ armcomputetensorutils::BuildArmComputeTensor(m_PermutedTensorX, permutedXInfo);
+ armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_PermutedTensorX);
+
+ auto permuteLayerX = std::make_unique<arm_compute::CLPermute>();
+ permuteLayerX->configure(clCompileContext,
+ &inputX,
+ &m_PermutedTensorX,
+ aclPermutationXVector);
+ m_PermuteLayerX.reset(permuteLayerX.release());
+ }
+
+ if (descriptor.m_Parameters.m_TransposeY == true)
+ {
+ armnn::PermutationVector permutationYVector
+ = GeneratePermutationVectorOnLastTwoDimensions(info.m_InputTensorInfos[1].GetNumDimensions());
+ const TensorInfo permutedYInfo = armnnUtils::Permuted(info.m_InputTensorInfos[1], permutationYVector);
+ const auto aclPermutationYVector = armcomputetensorutils::BuildArmComputePermutationVector(permutationYVector);
+ armcomputetensorutils::BuildArmComputeTensor(m_PermutedTensorY, permutedYInfo);
+ armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_PermutedTensorY);
+
+ auto permuteLayerY = std::make_unique<arm_compute::CLPermute>();
+ permuteLayerY->configure(clCompileContext,
+ &inputY,
+ &m_PermutedTensorY,
+ aclPermutationYVector);
+ m_PermuteLayerY.reset(permuteLayerY.release());
+ }
+
+ const arm_compute::GEMMInfo& gemm_info = arm_compute::GEMMInfo(false, // is inputX reshaped
+ false, // is inputY reshaped
+ false); // is inputY reshaped only 1st run
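+ // Configure the GEMM on the (possibly permuted) operands; no bias, alpha = 1.0, beta = 0.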
+ auto gemmLayer = std::make_unique<arm_compute::CLGEMM>();
+ gemmLayer->configure(clCompileContext,
+ descriptor.m_Parameters.m_TransposeX ? &m_PermutedTensorX : &inputX,
+ descriptor.m_Parameters.m_TransposeY ? &m_PermutedTensorY : &inputY,
+ nullptr,
+ &output,
+ 1.0,
+ 0,
+ gemm_info);
+ m_GEMMLayer.reset(gemmLayer.release());
+}
+
+void ClBatchMatMulWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL_GUID("ClBatchMatMulWorkload_Execute", this->GetGuid());
+ if (m_PermuteLayerX)
+ {
+ m_PermuteLayerX->run();
+ }
+ if (m_PermuteLayerY)
+ {
+ m_PermuteLayerY->run();
+ }
+ m_GEMMLayer->run();
+}
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClBatchMatMulWorkload.hpp b/src/backends/cl/workloads/ClBatchMatMulWorkload.hpp
new file mode 100644
index 0000000000..5277efc947
--- /dev/null
+++ b/src/backends/cl/workloads/ClBatchMatMulWorkload.hpp
@@ -0,0 +1,41 @@
+//
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "ClBaseWorkload.hpp"
+
+#include <arm_compute/runtime/IFunction.h>
+#include <arm_compute/runtime/CL/CLTensor.h>
+#include <memory>
+
+namespace armnn
+{
+ arm_compute::Status ClBatchMatMulValidate(const TensorInfo& inputX,
+ const TensorInfo& inputY,
+ const TensorInfo& output,
+ const BatchMatMulDescriptor& descriptor);
+
+ class ClBatchMatMulWorkload : public ClBaseWorkload<BatchMatMulQueueDescriptor>
+ {
+ public:
+ ClBatchMatMulWorkload(const BatchMatMulQueueDescriptor& descriptor,
+ const WorkloadInfo& info,
+ const arm_compute::CLCompileContext& clCompileContext);
+ virtual void Execute() const override;
+
+ private:
+ // ACL layers required to fully form a Batch Mat Mul layer.
+ std::unique_ptr<arm_compute::IFunction> m_GEMMLayer;
+ std::unique_ptr<arm_compute::IFunction> m_PermuteLayerX;
+ std::unique_ptr<arm_compute::IFunction> m_PermuteLayerY;
+
+ // Additional CL arm_compute::Tensors.
+ // Required to perform permutations.
+ arm_compute::CLTensor m_PermutedTensorX;
+ arm_compute::CLTensor m_PermutedTensorY;
+
+ };
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClWorkloads.hpp b/src/backends/cl/workloads/ClWorkloads.hpp
index c3a79b7583..44f3798d7d 100644
--- a/src/backends/cl/workloads/ClWorkloads.hpp
+++ b/src/backends/cl/workloads/ClWorkloads.hpp
@@ -10,6 +10,7 @@
#include "ClArgMinMaxWorkload.hpp"
#include "ClComparisonWorkload.hpp"
#include "ClConstantWorkload.hpp"
+#include "ClBatchMatMulWorkload.hpp"
#include "ClBatchNormalizationFloatWorkload.hpp"
#include "ClBatchToSpaceNdWorkload.hpp"
#include "ClCastWorkload.hpp"