about summary refs log tree commit diff
path: root/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp')
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp 149
1 files changed, 43 insertions, 106 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp
index 34a657f64f..b794c21807 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp
@@ -162,13 +162,12 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x8\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "cmp x26, #0x8\n"
+ "add x25, x25, #0x10\n"
".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
@@ -203,7 +202,6 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
@@ -242,9 +240,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
"11:" // Height 1: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 6b\n"
"tbz %x[flags], #1, 12f\n"
@@ -348,16 +345,14 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x8\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "cmp x26, #0x8\n"
".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
@@ -408,9 +403,7 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "add x24, x24, #0x10\n"
".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
@@ -465,10 +458,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
"24:" // Height 2: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 19b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -602,21 +593,18 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x8\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
"ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
"add x23, x23, #0x10\n"
".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "cmp x26, #0x8\n"
- ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
@@ -681,12 +669,9 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
"ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- "add x23, x23, #0x10\n"
".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
@@ -756,11 +741,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
"37:" // Height 3: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 32b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -923,26 +905,22 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x8\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
"ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
"ld1rqh { z3.h }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x8\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
@@ -1021,19 +999,15 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
"ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
"ld1rqh { z3.h }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- "add x22, x22, #0x10\n"
+ ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
+ ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
@@ -1114,12 +1088,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
"50:" // Height 4: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 45b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -1311,32 +1281,27 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x8\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
"ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
"ld1rqh { z3.h }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
"ld1rqh { z4.h }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
"add x21, x21, #0x10\n"
".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x8\n"
".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
@@ -1428,22 +1393,17 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
"ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
"ld1rqh { z3.h }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
+ ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
"ld1rqh { z4.h }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- "add x21, x21, #0x10\n"
- ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
+ ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
@@ -1539,13 +1499,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n"
"63:" // Height 5: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 58b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -1769,37 +1724,31 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x8\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
"ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
"ld1rqh { z3.h }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
"ld1rqh { z4.h }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
"ld1rqh { z5.h }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
+ "add x22, x22, #0x10\n"
".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
"add x20, x20, #0x10\n"
".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x8\n"
".inst 0x646540dc // bfdot z28.s, z6.h, z5.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x646540fd // bfdot z29.s, z7.h, z5.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
@@ -1905,25 +1854,19 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
"ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
"ld1rqh { z3.h }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
+ ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
"ld1rqh { z4.h }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
"ld1rqh { z5.h }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
+ ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
- "add x20, x20, #0x10\n"
- ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n"
".inst 0x646540dc // bfdot z28.s, z6.h, z5.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
".inst 0x646540fd // bfdot z29.s, z7.h, z5.h[0]\n"
@@ -2034,14 +1977,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n"
".inst 0x647d40ff // bfdot z31.s, z7.h, z5.h[3]\n"
"76:" // Height 6: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 71b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -2153,4 +2090,4 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
}
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif // __ARM_FEATURE_SVE