From 4ee8b1599dbaf7634d25607fa5ac96ba3dc6b0f2 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Fri, 16 Jul 2021 16:16:43 +0100 Subject: Update GEMM assembly kernels - Introduce Fp32 kernels with internal calculations in Bfloat16 when fast_mode is enabled - Improve kernel selection heuristics Signed-off-by: Georgios Pinitas Change-Id: I68a9e7e862b6fd2721b46e0d7cc791091c4ab279 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5965 Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins --- .../arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp') diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp index 6e3a00ed72..ba8a2ccb1d 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp @@ -309,8 +309,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "ld1r { v0.4s }, [x25]\n" "ld1r { v4.4s }, [x24]\n" "mov v1.16b, v0.16b\n" - "mov v2.16b, v0.16b\n" "mov v5.16b, v4.16b\n" + "mov v2.16b, v0.16b\n" "mov v6.16b, v4.16b\n" "mov v3.16b, v0.16b\n" "mov v7.16b, v4.16b\n" @@ -693,8 +693,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "ld1r { v0.4s }, [x25]\n" "ld1r { v4.4s }, [x24]\n" "mov v1.16b, v0.16b\n" - "mov v2.16b, v0.16b\n" "mov v5.16b, v4.16b\n" + "mov v2.16b, v0.16b\n" "mov v6.16b, v4.16b\n" "mov v3.16b, v0.16b\n" "mov v7.16b, v4.16b\n" @@ -1193,8 +1193,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "ld1r { v0.4s }, [x25]\n" "ld1r { v4.4s }, [x24]\n" "mov v1.16b, v0.16b\n" - "mov v2.16b, v0.16b\n" "mov v5.16b, v4.16b\n" + "mov v2.16b, v0.16b\n" "mov v6.16b, v4.16b\n" "mov v3.16b, v0.16b\n" "mov v7.16b, v4.16b\n" @@ -1809,8 +1809,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "ld1r { v0.4s }, [x25]\n" "ld1r { v4.4s }, [x24]\n" "mov v1.16b, v0.16b\n" - "mov v2.16b, v0.16b\n" "mov v5.16b, v4.16b\n" + "mov v2.16b, v0.16b\n" "mov v6.16b, v4.16b\n" "mov v3.16b, v0.16b\n" "mov v7.16b, v4.16b\n" @@ -2541,8 +2541,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "ld1r { v0.4s }, [x25]\n" "ld1r { v4.4s }, [x24]\n" "mov v1.16b, v0.16b\n" - "mov v2.16b, v0.16b\n" "mov v5.16b, v4.16b\n" + "mov v2.16b, v0.16b\n" "mov v6.16b, v4.16b\n" "mov v3.16b, v0.16b\n" "mov v7.16b, v4.16b\n" @@ -3392,8 +3392,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "ld1r { v0.4s }, [x25]\n" "ld1r { v4.4s }, [x24]\n" "mov v1.16b, v0.16b\n" - "mov v2.16b, v0.16b\n" "mov v5.16b, v4.16b\n" + "mov v2.16b, v0.16b\n" "mov v6.16b, v4.16b\n" "mov v3.16b, v0.16b\n" "mov v7.16b, v4.16b\n" -- cgit v1.2.1