From aaa9da1efa83911c7a67d50811ad669a92a7d12f Mon Sep 17 00:00:00 2001 From: David Mansell Date: Fri, 10 Mar 2023 13:48:50 +0000 Subject: arm_gemm: Add SME2 FP16 kernels. Resolves: COMPMID-5966 Change-Id: Ic0d694493178da029a297643855bd0cff01b174f Signed-off-by: David Mansell Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9302 Benchmark: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Viet-Hoa Do Comments-Addressed: Arm Jenkins --- .../transforms/sme_transpose_interleave_1VL_2x2.hpp | 13 +++++++++++++ .../transforms/sme_transpose_interleave_2VL_2x2.hpp | 13 +++++++++++++ .../transforms/sme_transpose_interleave_4VL_2x2.hpp | 13 +++++++++++++ 3 files changed, 39 insertions(+) (limited to 'src/core/NEON/kernels/arm_gemm/transforms') diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp index dfa0167c00..7120d1d33e 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp @@ -193,4 +193,17 @@ void Transform<1, 2, true, VLType::SME>( ); } +template<> +void Transform<1, 2, true, VLType::SME>( + __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax) +{ + sme_transpose_interleave_1VL_2x2( + reinterpret_cast(out), + reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(__fp16) / 2, + stride * sizeof(__fp16), + (kmax-k0) + ); +} + #endif diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp index 13e0a38ebc..3fc3920500 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp @@ -195,4 +195,17 @@ void Transform<2, 2, true, VLType::SME>( ); } +template<> +void Transform<2, 2, true, VLType::SME>( + __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax) +{ + sme_transpose_interleave_2VL_2x2( + reinterpret_cast(out), + reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(__fp16) / 2, + stride * sizeof(__fp16), + (kmax-k0) + ); +} + #endif diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp index 8badde53a9..9b28578217 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp @@ -155,4 +155,17 @@ void Transform<4, 2, true, VLType::SME>( ); } +template<> +void Transform<4, 2, true, VLType::SME>( + __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax) +{ + sme_transpose_interleave_4VL_2x2( + reinterpret_cast(out), + reinterpret_cast(in + k0 * stride + x0), + (xmax-x0) * sizeof(__fp16) / 2, + stride * sizeof(__fp16), + (kmax-k0) + ); +} + #endif -- cgit v1.2.1