aboutsummaryrefslogtreecommitdiff
path: root/src/gpu/cl
diff options
context:
space:
mode:
author: Ramy Elgammal <ramy.elgammal@arm.com> 2023-05-19 14:23:37 +0100
committer: Ramy Elgammal <ramy.elgammal@arm.com> 2023-06-23 20:06:45 +0000
commit: c952596e70f2fe0073029f053e329a4e930ced8c (patch)
tree: 1cf9b1c87c2288d6af436b570802d9cc6e8b30b5 /src/gpu/cl
parent: 47a50ef12f513cfa8fde6673b8a61ed0f2d0fbaa (diff)
download: ComputeLibrary-c952596e70f2fe0073029f053e329a4e930ced8c.tar.gz
Implement FP32/FP16 MatMul NT/T kernel using the MMUL extension
Resolves COMPMID-6195
Signed-off-by: ramy.elgammal@arm.com <ramy.elgammal@arm.com>
Change-Id: I8e85fe73308ed84ebb142d6d6d1562b62dddfaa5
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9819
Reviewed-by: SiCong Li <sicong.li@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/gpu/cl')
-rw-r--r-- src/gpu/cl/ClKernelLibrary.cpp | 1
-rw-r--r-- src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp | 26
-rw-r--r-- src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h | 1
3 files changed, 16 insertions, 12 deletions
diff --git a/src/gpu/cl/ClKernelLibrary.cpp b/src/gpu/cl/ClKernelLibrary.cpp
index 408f1f7a21..5355cb7402 100644
--- a/src/gpu/cl/ClKernelLibrary.cpp
+++ b/src/gpu/cl/ClKernelLibrary.cpp
@@ -320,6 +320,7 @@ const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map =
{ "l2_normalize_y", "common/l2_normalize.cl" },
{ "l2_normalize_z", "common/l2_normalize.cl" },
{ "mat_mul_native_mmul_nt_nt", "common/mat_mul_mmul.cl" },
+ { "mat_mul_native_mmul_nt_t", "common/mat_mul_mmul.cl" },
{ "mat_mul_native_nt_nt", "common/mat_mul.cl" },
{ "mat_mul_native_nt_t", "common/mat_mul.cl" },
{ "mat_mul_native_t_nt", "common/mat_mul.cl" },
diff --git a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp
index 32e69cabda..06a0bdee17 100644
--- a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp
+++ b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp
@@ -60,12 +60,11 @@ inline std::pair<int, int> adjust_m0_n0(int m0, int n0, int m, int n)
Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info)
{
const bool adj_lhs = matmul_kernel_info.adj_lhs;
- const bool adj_rhs = matmul_kernel_info.adj_rhs;
- const int m0 = matmul_kernel_info.m0;
- const int n0 = matmul_kernel_info.n0;
- const int k0 = matmul_kernel_info.k0;
+ const int m0 = matmul_kernel_info.m0;
+ const int n0 = matmul_kernel_info.n0;
+ const int k0 = matmul_kernel_info.k0;
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((adj_lhs || adj_rhs), "adj_lhs and adj_rhs are not supported yet");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((adj_lhs), "adj_lhs is not supported yet");
// Validate M0
ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0");
@@ -84,7 +83,7 @@ Status validate_input_shapes(const TensorShape &lhs_shape, const TensorShape &rh
{
ARM_COMPUTE_UNUSED(matmul_kernel_info);
const size_t lhs_k = lhs_shape.x();
- const size_t rhs_k = rhs_shape.y();
+ const size_t rhs_k = matmul_kernel_info.adj_rhs ? rhs_shape.x() : rhs_shape.y();
ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_k != rhs_k, "K dimension in Lhs and Rhs matrices must match.");
ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR((lhs_k % mmul_k0) != 0, "K dimension must be a multiple of %d", mmul_k0);
@@ -177,9 +176,11 @@ void ClMatMulNativeMMULKernel::configure(const ClCompileContext &compile_context
const int m = dst->dimension(1);
const int n = dst->dimension(0);
- const int k = lhs->tensor_shape().x();
- _m = m;
- _n = n;
+ const int k = matmul_kernel_info.adj_lhs ? lhs->tensor_shape().y() : lhs->tensor_shape().x();
+
+ _m = m;
+ _n = n;
+ _k = k;
int m0{};
int n0{};
@@ -199,15 +200,15 @@ void ClMatMulNativeMMULKernel::configure(const ClCompileContext &compile_context
build_opts.add_option_if(lhs->data_type() == DataType::F16, "-DHALF_PRECISION");
build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
- build_opts.add_option("-DK0=" + support::cpp11::to_string(matmul_kernel_info.k0));
build_opts.add_option("-DM0_LEFTOVER=" + support::cpp11::to_string(m0_leftover));
build_opts.add_option("-DN0_LEFTOVER=" + support::cpp11::to_string(n0_leftover));
build_opts.add_option("-DMMUL_M0=" + support::cpp11::to_string(mmul_m0));
build_opts.add_option("-DMMUL_N0=" + support::cpp11::to_string(mmul_n0));
build_opts.add_option("-DMMUL_K0=" + support::cpp11::to_string(mmul_k0));
- build_opts.add_option("-DK=" + support::cpp11::to_string(k));
- std::string kernel_name("mat_mul_native_mmul_nt_nt");
+ std::string kernel_name("mat_mul_native_mmul");
+ kernel_name += matmul_kernel_info.adj_lhs ? "_t" : "_nt";
+ kernel_name += matmul_kernel_info.adj_rhs ? "_t" : "_nt";
// A macro guard to compile ONLY the kernel of interest
build_opts.add_option("-D" + upper_string(kernel_name));
@@ -250,6 +251,7 @@ void ClMatMulNativeMMULKernel::run_op(ITensorPack &tensors, const Window &window
// Pass m and n at runtime as signed ints, to ensure results of any subtractions they could be operand in, would still be signed.
_kernel.setArg<cl_int>(idx++, _m);
_kernel.setArg<cl_int>(idx++, _n);
+ _kernel.setArg<cl_int>(idx++, _k);
// LWS_x should be multiple of 16 at least. (32, 2) has been chosen to have more work-items on a single core
// LWS also enforces the order of execution of the work items which improves cache utilization
diff --git a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h
index 26fe08c466..79f675d03b 100644
--- a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h
+++ b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h
@@ -86,6 +86,7 @@ public:
private:
int _m{ 1 };
int _n{ 1 };
+ int _k{ 1 };
};
} // namespace kernels
} // namespace opencl