diff options
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp')
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp | 258 |
1 files changed, 75 insertions, 183 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp index 943e0ac148..5b4b6b9b2e 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp @@ -149,12 +149,11 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" - "add x27, x27, #0x10\n" + "cmp x28, #0x4\n" "fmla z24.s, z9.s, z0.s[1]\n" "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" - "cmp x28, #0x4\n" + "add x27, x27, #0x10\n" "fmla z24.s, z10.s, z0.s[2]\n" - "prfm pldl1keep, [x27, #0x80]\n" "addvl x12, x12, #4\n" "fmla z24.s, z11.s, z0.s[3]\n" "bgt 9b\n" @@ -164,7 +163,6 @@ void sve_hybrid_fp32_mla_8x1VL ( "subs x28, x28, #0x1\n" "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" - "add x27, x27, #0x10\n" "addvl x12, x12, #1\n" "ble 11f\n" "ld1w { z9.s }, p2/Z, [x12]\n" @@ -181,9 +179,8 @@ void sve_hybrid_fp32_mla_8x1VL ( "fmla z24.s, z11.s, z0.s[3]\n" "addvl x12, x12, #1\n" "11:" // Height 1: Multiply loop: multiply skip - "prfm pldl1keep, [x27, #0x80]\n" - "add x9, x9, #0x1\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x9, x9, #0x1\n" "cmp x9, x19\n" "bne 6b\n" "tbz %x[flags], #1, 12f\n" @@ -254,18 +251,16 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x26]\n" - "add x27, x27, #0x10\n" + "cmp x28, #0x4\n" "fmla z25.s, z8.s, z1.s[0]\n" "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" - "add x26, x26, #0x10\n" + "add x27, x27, #0x10\n" "fmla z24.s, z9.s, z0.s[1]\n" "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" - "cmp x28, #0x4\n" + "add x26, x26, #0x10\n" "fmla z25.s, z9.s, z1.s[1]\n" - "prfm pldl1keep, [x27, #0x80]\n" "addvl x12, x12, #4\n" "fmla z24.s, z10.s, z0.s[2]\n" - "prfm pldl1keep, [x26, #0x80]\n" "fmla z25.s, z10.s, z1.s[2]\n" "fmla z24.s, z11.s, z0.s[3]\n" "fmla z25.s, z11.s, z1.s[3]\n" @@ -277,16 +272,14 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x26]\n" - "add x27, x27, #0x10\n" - "fmla z25.s, z8.s, z1.s[0]\n" - "add x26, x26, #0x10\n" "addvl x12, x12, #1\n" + "fmla z25.s, z8.s, z1.s[0]\n" "ble 24f\n" "ld1w { z9.s }, p2/Z, [x12]\n" "fmla z24.s, z9.s, z0.s[1]\n" "subs x28, x28, #0x1\n" - "fmla z25.s, z9.s, z1.s[1]\n" "addvl x12, x12, #1\n" + "fmla z25.s, z9.s, z1.s[1]\n" "ble 24f\n" "ld1w { z10.s }, p2/Z, [x12]\n" "fmla z24.s, z10.s, z0.s[2]\n" @@ -299,10 +292,8 @@ void sve_hybrid_fp32_mla_8x1VL ( "addvl x12, x12, #1\n" "fmla z25.s, z11.s, z1.s[3]\n" "24:" // Height 2: Multiply loop: multiply skip - "prfm pldl1keep, [x27, #0x80]\n" - "add x9, x9, #0x1\n" - "prfm pldl1keep, [x26, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x9, x9, #0x1\n" "cmp x9, x19\n" "bne 19b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -385,22 +376,19 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x26]\n" - "add x27, x27, #0x10\n" + "cmp x28, #0x4\n" "fmla z25.s, z8.s, z1.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x25]\n" - "add x26, x26, #0x10\n" + "add x27, x27, #0x10\n" "fmla z24.s, z9.s, z0.s[1]\n" "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" - "add x25, x25, #0x10\n" + "add x26, x26, #0x10\n" "fmla z26.s, z8.s, z2.s[0]\n" "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" - "cmp x28, #0x4\n" + "add x25, x25, #0x10\n" "fmla z25.s, z9.s, z1.s[1]\n" - "prfm pldl1keep, [x27, #0x80]\n" "addvl x12, x12, #4\n" "fmla z24.s, z10.s, z0.s[2]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x25, #0x80]\n" "fmla z26.s, z9.s, z2.s[1]\n" "fmla z25.s, z10.s, z1.s[2]\n" "fmla z24.s, z11.s, z0.s[3]\n" @@ -415,13 +403,10 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x26]\n" - "add x27, x27, #0x10\n" + "addvl x12, x12, #1\n" "fmla z25.s, z8.s, z1.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x25]\n" - "add x26, x26, #0x10\n" "fmla z26.s, z8.s, z2.s[0]\n" - "add x25, x25, #0x10\n" - "addvl x12, x12, #1\n" "ble 37f\n" "ld1w { z9.s }, p2/Z, [x12]\n" "fmla z24.s, z9.s, z0.s[1]\n" @@ -443,11 +428,8 @@ void sve_hybrid_fp32_mla_8x1VL ( "fmla z25.s, z11.s, z1.s[3]\n" "fmla z26.s, z11.s, z2.s[3]\n" "37:" // Height 3: Multiply loop: multiply skip - "prfm pldl1keep, [x27, #0x80]\n" - "add x9, x9, #0x1\n" - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x25, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x9, x9, #0x1\n" "cmp x9, x19\n" "bne 32b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -541,27 +523,23 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x26]\n" - "add x27, x27, #0x10\n" + "cmp x28, #0x4\n" "fmla z25.s, z8.s, z1.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x25]\n" - "add x26, x26, #0x10\n" + "add x27, x27, #0x10\n" "fmla z24.s, z9.s, z0.s[1]\n" "ld1rqw { z3.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "add x26, x26, #0x10\n" "fmla z26.s, z8.s, z2.s[0]\n" "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "fmla z27.s, z8.s, z3.s[0]\n" "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" - "cmp x28, #0x4\n" + "add x24, x24, #0x10\n" "fmla z25.s, z9.s, z1.s[1]\n" - "prfm pldl1keep, [x27, #0x80]\n" "addvl x12, x12, #4\n" "fmla z24.s, z10.s, z0.s[2]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x25, #0x80]\n" "fmla z26.s, z9.s, z2.s[1]\n" - "prfm pldl1keep, [x24, #0x80]\n" "fmla z27.s, z9.s, z3.s[1]\n" "fmla z25.s, z10.s, z1.s[2]\n" "fmla z24.s, z11.s, z0.s[3]\n" @@ -578,16 +556,12 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x26]\n" - "add x27, x27, #0x10\n" + "addvl x12, x12, #1\n" "fmla z25.s, z8.s, z1.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x25]\n" - "add x26, x26, #0x10\n" - "fmla z26.s, z8.s, z2.s[0]\n" "ld1rqw { z3.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "fmla z26.s, z8.s, z2.s[0]\n" "fmla z27.s, z8.s, z3.s[0]\n" - "add x24, x24, #0x10\n" - "addvl x12, x12, #1\n" "ble 50f\n" "ld1w { z9.s }, p2/Z, [x12]\n" "fmla z24.s, z9.s, z0.s[1]\n" @@ -612,12 +586,8 @@ void sve_hybrid_fp32_mla_8x1VL ( "fmla z26.s, z11.s, z2.s[3]\n" "fmla z27.s, z11.s, z3.s[3]\n" "50:" // Height 4: Multiply loop: multiply skip - "prfm pldl1keep, [x27, #0x80]\n" - "add x9, x9, #0x1\n" - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "prfm pldl1keep, [x24, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x9, x9, #0x1\n" "cmp x9, x19\n" "bne 45b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -722,33 +692,28 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x26]\n" - "add x27, x27, #0x10\n" + "cmp x28, #0x4\n" "fmla z25.s, z8.s, z1.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x25]\n" - "add x26, x26, #0x10\n" + "add x27, x27, #0x10\n" "fmla z24.s, z9.s, z0.s[1]\n" "ld1rqw { z3.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "add x26, x26, #0x10\n" "fmla z26.s, z8.s, z2.s[0]\n" "ld1rqw { z4.s }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "fmla z27.s, z8.s, z3.s[0]\n" "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" "fmla z25.s, z9.s, z1.s[1]\n" "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" - "cmp x28, #0x4\n" + "add x23, x23, #0x10\n" "fmla z28.s, z8.s, z4.s[0]\n" - "prfm pldl1keep, [x27, #0x80]\n" "addvl x12, x12, #4\n" "fmla z26.s, z9.s, z2.s[1]\n" - "prfm pldl1keep, [x26, #0x80]\n" "fmla z24.s, z10.s, z0.s[2]\n" - "prfm pldl1keep, [x25, #0x80]\n" "fmla z27.s, z9.s, z3.s[1]\n" - "prfm pldl1keep, [x24, #0x80]\n" "fmla z25.s, z10.s, z1.s[2]\n" - "prfm pldl1keep, [x23, #0x80]\n" "fmla z28.s, z9.s, z4.s[1]\n" "fmla z26.s, z10.s, z2.s[2]\n" "fmla z27.s, z10.s, z3.s[2]\n" @@ -766,19 +731,14 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x26]\n" - "add x27, x27, #0x10\n" + "addvl x12, x12, #1\n" "fmla z25.s, z8.s, z1.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x25]\n" - "add x26, x26, #0x10\n" - "fmla z26.s, z8.s, z2.s[0]\n" "ld1rqw { z3.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" - "fmla z27.s, z8.s, z3.s[0]\n" + "fmla z26.s, z8.s, z2.s[0]\n" "ld1rqw { z4.s }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "fmla z27.s, z8.s, z3.s[0]\n" "fmla z28.s, z8.s, z4.s[0]\n" - "add x23, x23, #0x10\n" - "addvl x12, x12, #1\n" "ble 63f\n" "ld1w { z9.s }, p2/Z, [x12]\n" "fmla z24.s, z9.s, z0.s[1]\n" @@ -806,13 +766,8 @@ void sve_hybrid_fp32_mla_8x1VL ( "fmla z27.s, z11.s, z3.s[3]\n" "fmla z28.s, z11.s, z4.s[3]\n" "63:" // Height 5: Multiply loop: multiply skip - "prfm pldl1keep, [x27, #0x80]\n" - "add x9, x9, #0x1\n" - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x9, x9, #0x1\n" "cmp x9, x19\n" "bne 58b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -928,38 +883,32 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x26]\n" - "add x27, x27, #0x10\n" + "cmp x28, #0x4\n" "fmla z25.s, z8.s, z1.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x25]\n" - "add x26, x26, #0x10\n" + "add x27, x27, #0x10\n" "fmla z24.s, z9.s, z0.s[1]\n" "ld1rqw { z3.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "add x26, x26, #0x10\n" "fmla z26.s, z8.s, z2.s[0]\n" "ld1rqw { z4.s }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "fmla z27.s, z8.s, z3.s[0]\n" "ld1rqw { z5.s }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" "fmla z25.s, z9.s, z1.s[1]\n" "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" - "add x22, x22, #0x10\n" + "add x23, x23, #0x10\n" "fmla z28.s, z8.s, z4.s[0]\n" "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" - "cmp x28, #0x4\n" + "add x22, x22, #0x10\n" "fmla z29.s, z8.s, z5.s[0]\n" - "prfm pldl1keep, [x27, #0x80]\n" "addvl x12, x12, #4\n" "fmla z26.s, z9.s, z2.s[1]\n" - "prfm pldl1keep, [x26, #0x80]\n" "fmla z27.s, z9.s, z3.s[1]\n" - "prfm pldl1keep, [x25, #0x80]\n" "fmla z24.s, z10.s, z0.s[2]\n" - "prfm pldl1keep, [x24, #0x80]\n" "fmla z28.s, z9.s, z4.s[1]\n" - "prfm pldl1keep, [x23, #0x80]\n" "fmla z29.s, z9.s, z5.s[1]\n" - "prfm pldl1keep, [x22, #0x80]\n" "fmla z25.s, z10.s, z1.s[2]\n" "fmla z26.s, z10.s, z2.s[2]\n" "fmla z27.s, z10.s, z3.s[2]\n" @@ -979,22 +928,16 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x26]\n" - "add x27, x27, #0x10\n" + "addvl x12, x12, #1\n" "fmla z25.s, z8.s, z1.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x25]\n" - "add x26, x26, #0x10\n" - "fmla z26.s, z8.s, z2.s[0]\n" "ld1rqw { z3.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" - "fmla z27.s, z8.s, z3.s[0]\n" + "fmla z26.s, z8.s, z2.s[0]\n" "ld1rqw { z4.s }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" - "fmla z28.s, z8.s, z4.s[0]\n" + "fmla z27.s, z8.s, z3.s[0]\n" "ld1rqw { z5.s }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "fmla z28.s, z8.s, z4.s[0]\n" "fmla z29.s, z8.s, z5.s[0]\n" - "add x22, x22, #0x10\n" - "addvl x12, x12, #1\n" "ble 76f\n" "ld1w { z9.s }, p2/Z, [x12]\n" "fmla z24.s, z9.s, z0.s[1]\n" @@ -1025,14 +968,8 @@ void sve_hybrid_fp32_mla_8x1VL ( "fmla z28.s, z11.s, z4.s[3]\n" "fmla z29.s, z11.s, z5.s[3]\n" "76:" // Height 6: Multiply loop: multiply skip - "prfm pldl1keep, [x27, #0x80]\n" - "add x9, x9, #0x1\n" - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x9, x9, #0x1\n" "cmp x9, x19\n" "bne 71b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -1159,43 +1096,36 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x26]\n" - "add x27, x27, #0x10\n" + "cmp x28, #0x4\n" "fmla z25.s, z8.s, z1.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x25]\n" - "add x26, x26, #0x10\n" + "add x27, x27, #0x10\n" "fmla z24.s, z9.s, z0.s[1]\n" "ld1rqw { z3.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "add x26, x26, #0x10\n" "fmla z26.s, z8.s, z2.s[0]\n" "ld1rqw { z4.s }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "fmla z27.s, z8.s, z3.s[0]\n" "ld1rqw { z5.s }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" "fmla z25.s, z9.s, z1.s[1]\n" "ld1rqw { z6.s }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" + "add x23, x23, #0x10\n" "fmla z28.s, z8.s, z4.s[0]\n" "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" - "add x21, x21, #0x10\n" + "add x22, x22, #0x10\n" "fmla z29.s, z8.s, z5.s[0]\n" "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" - "cmp x28, #0x4\n" + "add x21, x21, #0x10\n" "fmla z30.s, z8.s, z6.s[0]\n" - "prfm pldl1keep, [x27, #0x80]\n" "addvl x12, x12, #4\n" "fmla z26.s, z9.s, z2.s[1]\n" - "prfm pldl1keep, [x26, #0x80]\n" "fmla z27.s, z9.s, z3.s[1]\n" - "prfm pldl1keep, [x25, #0x80]\n" "fmla z28.s, z9.s, z4.s[1]\n" - "prfm pldl1keep, [x24, #0x80]\n" "fmla z29.s, z9.s, z5.s[1]\n" - "prfm pldl1keep, [x23, #0x80]\n" "fmla z30.s, z9.s, z6.s[1]\n" - "prfm pldl1keep, [x22, #0x80]\n" "fmla z24.s, z10.s, z0.s[2]\n" - "prfm pldl1keep, [x21, #0x80]\n" "fmla z25.s, z10.s, z1.s[2]\n" "fmla z26.s, z10.s, z2.s[2]\n" "fmla z27.s, z10.s, z3.s[2]\n" @@ -1217,25 +1147,18 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x26]\n" - "add x27, x27, #0x10\n" + "addvl x12, x12, #1\n" "fmla z25.s, z8.s, z1.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x25]\n" - "add x26, x26, #0x10\n" - "fmla z26.s, z8.s, z2.s[0]\n" "ld1rqw { z3.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" - "fmla z27.s, z8.s, z3.s[0]\n" + "fmla z26.s, z8.s, z2.s[0]\n" "ld1rqw { z4.s }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" - "fmla z28.s, z8.s, z4.s[0]\n" + "fmla z27.s, z8.s, z3.s[0]\n" "ld1rqw { z5.s }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" - "fmla z29.s, z8.s, z5.s[0]\n" "ld1rqw { z6.s }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" + "fmla z28.s, z8.s, z4.s[0]\n" + "fmla z29.s, z8.s, z5.s[0]\n" "fmla z30.s, z8.s, z6.s[0]\n" - "add x21, x21, #0x10\n" - "addvl x12, x12, #1\n" "ble 89f\n" "ld1w { z9.s }, p2/Z, [x12]\n" "fmla z24.s, z9.s, z0.s[1]\n" @@ -1269,15 +1192,8 @@ void sve_hybrid_fp32_mla_8x1VL ( "fmla z29.s, z11.s, z5.s[3]\n" "fmla z30.s, z11.s, z6.s[3]\n" "89:" // Height 7: Multiply loop: multiply skip - "prfm pldl1keep, [x27, #0x80]\n" - "add x9, x9, #0x1\n" - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x21, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x9, x9, #0x1\n" "cmp x9, x19\n" "bne 84b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -1418,48 +1334,40 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x26]\n" - "add x27, x27, #0x10\n" + "cmp x28, #0x4\n" "fmla z25.s, z8.s, z1.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x25]\n" - "add x26, x26, #0x10\n" + "add x27, x27, #0x10\n" "fmla z24.s, z9.s, z0.s[1]\n" "ld1rqw { z3.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" + "add x26, x26, #0x10\n" "fmla z26.s, z8.s, z2.s[0]\n" "ld1rqw { z4.s }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" + "add x25, x25, #0x10\n" "fmla z27.s, z8.s, z3.s[0]\n" "ld1rqw { z5.s }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" + "add x24, x24, #0x10\n" "fmla z25.s, z9.s, z1.s[1]\n" "ld1rqw { z6.s }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" + "add x23, x23, #0x10\n" "fmla z28.s, z8.s, z4.s[0]\n" "ld1rqw { z7.s }, p0/Z, [x20]\n" - "add x21, x21, #0x10\n" + "add x22, x22, #0x10\n" "fmla z29.s, z8.s, z5.s[0]\n" "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" - "add x20, x20, #0x10\n" + "add x21, x21, #0x10\n" "fmla z30.s, z8.s, z6.s[0]\n" "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" - "cmp x28, #0x4\n" + "add x20, x20, #0x10\n" "fmla z31.s, z8.s, z7.s[0]\n" - "prfm pldl1keep, [x27, #0x80]\n" "addvl x12, x12, #4\n" "fmla z26.s, z9.s, z2.s[1]\n" - "prfm pldl1keep, [x26, #0x80]\n" "fmla z27.s, z9.s, z3.s[1]\n" - "prfm pldl1keep, [x25, #0x80]\n" "fmla z28.s, z9.s, z4.s[1]\n" - "prfm pldl1keep, [x24, #0x80]\n" "fmla z29.s, z9.s, z5.s[1]\n" - "prfm pldl1keep, [x23, #0x80]\n" "fmla z30.s, z9.s, z6.s[1]\n" - "prfm pldl1keep, [x22, #0x80]\n" "fmla z31.s, z9.s, z7.s[1]\n" - "prfm pldl1keep, [x21, #0x80]\n" "fmla z24.s, z10.s, z0.s[2]\n" - "prfm pldl1keep, [x20, #0x80]\n" "fmla z25.s, z10.s, z1.s[2]\n" "fmla z26.s, z10.s, z2.s[2]\n" "fmla z27.s, z10.s, z3.s[2]\n" @@ -1483,28 +1391,20 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" "ld1rqw { z1.s }, p0/Z, [x26]\n" - "add x27, x27, #0x10\n" + "addvl x12, x12, #1\n" "fmla z25.s, z8.s, z1.s[0]\n" "ld1rqw { z2.s }, p0/Z, [x25]\n" - "add x26, x26, #0x10\n" - "fmla z26.s, z8.s, z2.s[0]\n" "ld1rqw { z3.s }, p0/Z, [x24]\n" - "add x25, x25, #0x10\n" - "fmla z27.s, z8.s, z3.s[0]\n" + "fmla z26.s, z8.s, z2.s[0]\n" "ld1rqw { z4.s }, p0/Z, [x23]\n" - "add x24, x24, #0x10\n" - "fmla z28.s, z8.s, z4.s[0]\n" + "fmla z27.s, z8.s, z3.s[0]\n" "ld1rqw { z5.s }, p0/Z, [x22]\n" - "add x23, x23, #0x10\n" - "fmla z29.s, z8.s, z5.s[0]\n" "ld1rqw { z6.s }, p0/Z, [x21]\n" - "add x22, x22, #0x10\n" - "fmla z30.s, z8.s, z6.s[0]\n" + "fmla z28.s, z8.s, z4.s[0]\n" "ld1rqw { z7.s }, p0/Z, [x20]\n" - "add x21, x21, #0x10\n" + "fmla z29.s, z8.s, z5.s[0]\n" + "fmla z30.s, z8.s, z6.s[0]\n" "fmla z31.s, z8.s, z7.s[0]\n" - "add x20, x20, #0x10\n" - "addvl x12, x12, #1\n" "ble 102f\n" "ld1w { z9.s }, p2/Z, [x12]\n" "fmla z24.s, z9.s, z0.s[1]\n" @@ -1541,16 +1441,8 @@ void sve_hybrid_fp32_mla_8x1VL ( "fmla z30.s, z11.s, z6.s[3]\n" "fmla z31.s, z11.s, z7.s[3]\n" "102:" // Height 8: Multiply loop: multiply skip - "prfm pldl1keep, [x27, #0x80]\n" - "add x9, x9, #0x1\n" - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x21, #0x80]\n" - "prfm pldl1keep, [x20, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x9, x9, #0x1\n" "cmp x9, x19\n" "bne 97b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" |