diff options
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp')
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp | 149 |
1 files changed, 43 insertions, 106 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp index 34a657f64f..b794c21807 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp @@ -162,13 +162,12 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x8\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - "cmp x26, #0x8\n" + "add x25, x25, #0x10\n" ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" @@ -203,7 +202,6 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "add x25, x25, #0x10\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" @@ -242,9 +240,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" "11:" // Height 1: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 6b\n" "tbz %x[flags], #1, 12f\n" @@ -348,16 +345,14 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x8\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x25, x25, #0x10\n" "add x24, x24, #0x10\n" ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "cmp x26, #0x8\n" ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - "prfm pldl1keep, [x24, #0x80]\n" ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" @@ -408,9 +403,7 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" - "add x24, x24, #0x10\n" ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" @@ -465,10 +458,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" "24:" // Height 2: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 19b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -602,21 +593,18 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x8\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" "ld1rqh { z2.h }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" + ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" "add x23, x23, #0x10\n" ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "cmp x26, #0x8\n" - ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - "prfm pldl1keep, [x23, #0x80]\n" ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" @@ -681,12 +669,9 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" "ld1rqh { z2.h }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" - "add x23, x23, #0x10\n" ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" @@ -756,11 +741,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" "37:" // Height 3: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 32b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -923,26 +905,22 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x8\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" "ld1rqh { z2.h }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" "ld1rqh { z3.h }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "add x22, x22, #0x10\n" + "add x23, x23, #0x10\n" ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x26, #0x8\n" + "add x22, x22, #0x10\n" + ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - "prfm pldl1keep, [x22, #0x80]\n" ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" @@ -1021,19 +999,15 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" "ld1rqh { z2.h }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" - ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" "ld1rqh { z3.h }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" - ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" - "add x22, x22, #0x10\n" + ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" - ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" + ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" @@ -1114,12 +1088,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n" "50:" // Height 4: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 45b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -1311,32 +1281,27 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x8\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" "ld1rqh { z2.h }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" "ld1rqh { z3.h }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" "ld1rqh { z4.h }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" + "add x23, x23, #0x10\n" ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x22, x22, #0x10\n" + ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" "add x21, x21, #0x10\n" ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x26, #0x8\n" ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" - "prfm pldl1keep, [x22, #0x80]\n" ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" - "prfm pldl1keep, [x21, #0x80]\n" ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n" @@ -1428,22 +1393,17 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" "ld1rqh { z2.h }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" - ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" "ld1rqh { z3.h }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" - ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" + ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" "ld1rqh { z4.h }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" - "add x21, x21, #0x10\n" - ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" + ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" @@ -1539,13 +1499,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n" ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n" "63:" // Height 5: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x21, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 58b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -1769,37 +1724,31 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "cmp x26, #0x8\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" "ld1rqh { z2.h }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" "ld1rqh { z3.h }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" "ld1rqh { z4.h }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" + "add x23, x23, #0x10\n" ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" "ld1rqh { z5.h }, p0/Z, [x20]\n" - "add x21, x21, #0x10\n" + "add x22, x22, #0x10\n" ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x21, x21, #0x10\n" + ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" "add x20, x20, #0x10\n" ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x26, #0x8\n" ".inst 0x646540dc // bfdot z28.s, z6.h, z5.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" - "prfm pldl1keep, [x22, #0x80]\n" ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n" - "prfm pldl1keep, [x21, #0x80]\n" ".inst 0x646540fd // bfdot z29.s, z7.h, z5.h[0]\n" "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" - "prfm pldl1keep, [x20, #0x80]\n" ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n" @@ -1905,25 +1854,19 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" "ld1rqh { z1.h }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" "ld1rqh { z2.h }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" - ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" "ld1rqh { z3.h }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" - ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" + ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" "ld1rqh { z4.h }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" "ld1rqh { z5.h }, p0/Z, [x20]\n" - "add x21, x21, #0x10\n" + ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" - "add x20, x20, #0x10\n" - ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n" ".inst 0x646540dc // bfdot z28.s, z6.h, z5.h[0]\n" "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n" ".inst 0x646540fd // bfdot z29.s, z7.h, z5.h[0]\n" @@ -2034,14 +1977,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n" ".inst 0x647d40ff // bfdot z31.s, z7.h, z5.h[3]\n" "76:" // Height 6: Multiply loop: multiply skip - "prfm pldl1keep, [x25, #0x80]\n" - "add x27, x27, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x21, #0x80]\n" - "prfm pldl1keep, [x20, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" "cmp x27, x19\n" "bne 71b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -2153,4 +2090,4 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( } } // namespace arm_gemm -#endif // ARM_COMPUTE_ENABLE_SVE +#endif // __ARM_FEATURE_SVE |