about summary refs log tree commit diff
path: root/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp')
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp 258
1 file changed, 75 insertions, 183 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
index 943e0ac148..5b4b6b9b2e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
@@ -149,12 +149,11 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "add x27, x27, #0x10\n"
+ "cmp x28, #0x4\n"
"fmla z24.s, z9.s, z0.s[1]\n"
"ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x28, #0x4\n"
+ "add x27, x27, #0x10\n"
"fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"addvl x12, x12, #4\n"
"fmla z24.s, z11.s, z0.s[3]\n"
"bgt 9b\n"
@@ -164,7 +163,6 @@ void sve_hybrid_fp32_mla_8x1VL (
"subs x28, x28, #0x1\n"
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
- "add x27, x27, #0x10\n"
"addvl x12, x12, #1\n"
"ble 11f\n"
"ld1w { z9.s }, p2/Z, [x12]\n"
@@ -181,9 +179,8 @@ void sve_hybrid_fp32_mla_8x1VL (
"fmla z24.s, z11.s, z0.s[3]\n"
"addvl x12, x12, #1\n"
"11:" // Height 1: Multiply loop: multiply skip
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x9, x9, #0x1\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x9, x9, #0x1\n"
"cmp x9, x19\n"
"bne 6b\n"
"tbz %x[flags], #1, 12f\n"
@@ -254,18 +251,16 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
+ "cmp x28, #0x4\n"
"fmla z25.s, z8.s, z1.s[0]\n"
"ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "add x26, x26, #0x10\n"
+ "add x27, x27, #0x10\n"
"fmla z24.s, z9.s, z0.s[1]\n"
"ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x28, #0x4\n"
+ "add x26, x26, #0x10\n"
"fmla z25.s, z9.s, z1.s[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"addvl x12, x12, #4\n"
"fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"fmla z25.s, z10.s, z1.s[2]\n"
"fmla z24.s, z11.s, z0.s[3]\n"
"fmla z25.s, z11.s, z1.s[3]\n"
@@ -277,16 +272,14 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
- "add x26, x26, #0x10\n"
"addvl x12, x12, #1\n"
+ "fmla z25.s, z8.s, z1.s[0]\n"
"ble 24f\n"
"ld1w { z9.s }, p2/Z, [x12]\n"
"fmla z24.s, z9.s, z0.s[1]\n"
"subs x28, x28, #0x1\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
"addvl x12, x12, #1\n"
+ "fmla z25.s, z9.s, z1.s[1]\n"
"ble 24f\n"
"ld1w { z10.s }, p2/Z, [x12]\n"
"fmla z24.s, z10.s, z0.s[2]\n"
@@ -299,10 +292,8 @@ void sve_hybrid_fp32_mla_8x1VL (
"addvl x12, x12, #1\n"
"fmla z25.s, z11.s, z1.s[3]\n"
"24:" // Height 2: Multiply loop: multiply skip
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x9, x9, #0x1\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x9, x9, #0x1\n"
"cmp x9, x19\n"
"bne 19b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -385,22 +376,19 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
+ "cmp x28, #0x4\n"
"fmla z25.s, z8.s, z1.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
+ "add x27, x27, #0x10\n"
"fmla z24.s, z9.s, z0.s[1]\n"
"ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
+ "add x26, x26, #0x10\n"
"fmla z26.s, z8.s, z2.s[0]\n"
"ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x28, #0x4\n"
+ "add x25, x25, #0x10\n"
"fmla z25.s, z9.s, z1.s[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"addvl x12, x12, #4\n"
"fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla z26.s, z9.s, z2.s[1]\n"
"fmla z25.s, z10.s, z1.s[2]\n"
"fmla z24.s, z11.s, z0.s[3]\n"
@@ -415,13 +403,10 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
+ "addvl x12, x12, #1\n"
"fmla z25.s, z8.s, z1.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
"fmla z26.s, z8.s, z2.s[0]\n"
- "add x25, x25, #0x10\n"
- "addvl x12, x12, #1\n"
"ble 37f\n"
"ld1w { z9.s }, p2/Z, [x12]\n"
"fmla z24.s, z9.s, z0.s[1]\n"
@@ -443,11 +428,8 @@ void sve_hybrid_fp32_mla_8x1VL (
"fmla z25.s, z11.s, z1.s[3]\n"
"fmla z26.s, z11.s, z2.s[3]\n"
"37:" // Height 3: Multiply loop: multiply skip
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x9, x9, #0x1\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x9, x9, #0x1\n"
"cmp x9, x19\n"
"bne 32b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -541,27 +523,23 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
+ "cmp x28, #0x4\n"
"fmla z25.s, z8.s, z1.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
+ "add x27, x27, #0x10\n"
"fmla z24.s, z9.s, z0.s[1]\n"
"ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "add x26, x26, #0x10\n"
"fmla z26.s, z8.s, z2.s[0]\n"
"ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"fmla z27.s, z8.s, z3.s[0]\n"
"ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x28, #0x4\n"
+ "add x24, x24, #0x10\n"
"fmla z25.s, z9.s, z1.s[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"addvl x12, x12, #4\n"
"fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla z26.s, z9.s, z2.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla z27.s, z9.s, z3.s[1]\n"
"fmla z25.s, z10.s, z1.s[2]\n"
"fmla z24.s, z11.s, z0.s[3]\n"
@@ -578,16 +556,12 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
+ "addvl x12, x12, #1\n"
"fmla z25.s, z8.s, z1.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
"ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "fmla z26.s, z8.s, z2.s[0]\n"
"fmla z27.s, z8.s, z3.s[0]\n"
- "add x24, x24, #0x10\n"
- "addvl x12, x12, #1\n"
"ble 50f\n"
"ld1w { z9.s }, p2/Z, [x12]\n"
"fmla z24.s, z9.s, z0.s[1]\n"
@@ -612,12 +586,8 @@ void sve_hybrid_fp32_mla_8x1VL (
"fmla z26.s, z11.s, z2.s[3]\n"
"fmla z27.s, z11.s, z3.s[3]\n"
"50:" // Height 4: Multiply loop: multiply skip
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x9, x9, #0x1\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x9, x9, #0x1\n"
"cmp x9, x19\n"
"bne 45b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -722,33 +692,28 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
+ "cmp x28, #0x4\n"
"fmla z25.s, z8.s, z1.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
+ "add x27, x27, #0x10\n"
"fmla z24.s, z9.s, z0.s[1]\n"
"ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "add x26, x26, #0x10\n"
"fmla z26.s, z8.s, z2.s[0]\n"
"ld1rqw { z4.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"fmla z27.s, z8.s, z3.s[0]\n"
"ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla z25.s, z9.s, z1.s[1]\n"
"ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x28, #0x4\n"
+ "add x23, x23, #0x10\n"
"fmla z28.s, z8.s, z4.s[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"addvl x12, x12, #4\n"
"fmla z26.s, z9.s, z2.s[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla z27.s, z9.s, z3.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla z25.s, z10.s, z1.s[2]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"fmla z28.s, z9.s, z4.s[1]\n"
"fmla z26.s, z10.s, z2.s[2]\n"
"fmla z27.s, z10.s, z3.s[2]\n"
@@ -766,19 +731,14 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
+ "addvl x12, x12, #1\n"
"fmla z25.s, z8.s, z1.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
"ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "fmla z27.s, z8.s, z3.s[0]\n"
+ "fmla z26.s, z8.s, z2.s[0]\n"
"ld1rqw { z4.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "fmla z27.s, z8.s, z3.s[0]\n"
"fmla z28.s, z8.s, z4.s[0]\n"
- "add x23, x23, #0x10\n"
- "addvl x12, x12, #1\n"
"ble 63f\n"
"ld1w { z9.s }, p2/Z, [x12]\n"
"fmla z24.s, z9.s, z0.s[1]\n"
@@ -806,13 +766,8 @@ void sve_hybrid_fp32_mla_8x1VL (
"fmla z27.s, z11.s, z3.s[3]\n"
"fmla z28.s, z11.s, z4.s[3]\n"
"63:" // Height 5: Multiply loop: multiply skip
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x9, x9, #0x1\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x9, x9, #0x1\n"
"cmp x9, x19\n"
"bne 58b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -928,38 +883,32 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
+ "cmp x28, #0x4\n"
"fmla z25.s, z8.s, z1.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
+ "add x27, x27, #0x10\n"
"fmla z24.s, z9.s, z0.s[1]\n"
"ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "add x26, x26, #0x10\n"
"fmla z26.s, z8.s, z2.s[0]\n"
"ld1rqw { z4.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"fmla z27.s, z8.s, z3.s[0]\n"
"ld1rqw { z5.s }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla z25.s, z9.s, z1.s[1]\n"
"ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla z28.s, z8.s, z4.s[0]\n"
"ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x28, #0x4\n"
+ "add x22, x22, #0x10\n"
"fmla z29.s, z8.s, z5.s[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"addvl x12, x12, #4\n"
"fmla z26.s, z9.s, z2.s[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"fmla z27.s, z9.s, z3.s[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla z28.s, z9.s, z4.s[1]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"fmla z29.s, z9.s, z5.s[1]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"fmla z25.s, z10.s, z1.s[2]\n"
"fmla z26.s, z10.s, z2.s[2]\n"
"fmla z27.s, z10.s, z3.s[2]\n"
@@ -979,22 +928,16 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
+ "addvl x12, x12, #1\n"
"fmla z25.s, z8.s, z1.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
"ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "fmla z27.s, z8.s, z3.s[0]\n"
+ "fmla z26.s, z8.s, z2.s[0]\n"
"ld1rqw { z4.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
+ "fmla z27.s, z8.s, z3.s[0]\n"
"ld1rqw { z5.s }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "fmla z28.s, z8.s, z4.s[0]\n"
"fmla z29.s, z8.s, z5.s[0]\n"
- "add x22, x22, #0x10\n"
- "addvl x12, x12, #1\n"
"ble 76f\n"
"ld1w { z9.s }, p2/Z, [x12]\n"
"fmla z24.s, z9.s, z0.s[1]\n"
@@ -1025,14 +968,8 @@ void sve_hybrid_fp32_mla_8x1VL (
"fmla z28.s, z11.s, z4.s[3]\n"
"fmla z29.s, z11.s, z5.s[3]\n"
"76:" // Height 6: Multiply loop: multiply skip
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x9, x9, #0x1\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x9, x9, #0x1\n"
"cmp x9, x19\n"
"bne 71b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -1159,43 +1096,36 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
+ "cmp x28, #0x4\n"
"fmla z25.s, z8.s, z1.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
+ "add x27, x27, #0x10\n"
"fmla z24.s, z9.s, z0.s[1]\n"
"ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "add x26, x26, #0x10\n"
"fmla z26.s, z8.s, z2.s[0]\n"
"ld1rqw { z4.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"fmla z27.s, z8.s, z3.s[0]\n"
"ld1rqw { z5.s }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla z25.s, z9.s, z1.s[1]\n"
"ld1rqw { z6.s }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla z28.s, z8.s, z4.s[0]\n"
"ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "add x21, x21, #0x10\n"
+ "add x22, x22, #0x10\n"
"fmla z29.s, z8.s, z5.s[0]\n"
"ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x28, #0x4\n"
+ "add x21, x21, #0x10\n"
"fmla z30.s, z8.s, z6.s[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"addvl x12, x12, #4\n"
"fmla z26.s, z9.s, z2.s[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"fmla z27.s, z9.s, z3.s[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla z28.s, z9.s, z4.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla z29.s, z9.s, z5.s[1]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"fmla z30.s, z9.s, z6.s[1]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"fmla z25.s, z10.s, z1.s[2]\n"
"fmla z26.s, z10.s, z2.s[2]\n"
"fmla z27.s, z10.s, z3.s[2]\n"
@@ -1217,25 +1147,18 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
+ "addvl x12, x12, #1\n"
"fmla z25.s, z8.s, z1.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
"ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "fmla z27.s, z8.s, z3.s[0]\n"
+ "fmla z26.s, z8.s, z2.s[0]\n"
"ld1rqw { z4.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
+ "fmla z27.s, z8.s, z3.s[0]\n"
"ld1rqw { z5.s }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "fmla z29.s, z8.s, z5.s[0]\n"
"ld1rqw { z6.s }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
+ "fmla z28.s, z8.s, z4.s[0]\n"
+ "fmla z29.s, z8.s, z5.s[0]\n"
"fmla z30.s, z8.s, z6.s[0]\n"
- "add x21, x21, #0x10\n"
- "addvl x12, x12, #1\n"
"ble 89f\n"
"ld1w { z9.s }, p2/Z, [x12]\n"
"fmla z24.s, z9.s, z0.s[1]\n"
@@ -1269,15 +1192,8 @@ void sve_hybrid_fp32_mla_8x1VL (
"fmla z29.s, z11.s, z5.s[3]\n"
"fmla z30.s, z11.s, z6.s[3]\n"
"89:" // Height 7: Multiply loop: multiply skip
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x9, x9, #0x1\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x9, x9, #0x1\n"
"cmp x9, x19\n"
"bne 84b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -1418,48 +1334,40 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
+ "cmp x28, #0x4\n"
"fmla z25.s, z8.s, z1.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
+ "add x27, x27, #0x10\n"
"fmla z24.s, z9.s, z0.s[1]\n"
"ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "add x26, x26, #0x10\n"
"fmla z26.s, z8.s, z2.s[0]\n"
"ld1rqw { z4.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"fmla z27.s, z8.s, z3.s[0]\n"
"ld1rqw { z5.s }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla z25.s, z9.s, z1.s[1]\n"
"ld1rqw { z6.s }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla z28.s, z8.s, z4.s[0]\n"
"ld1rqw { z7.s }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
+ "add x22, x22, #0x10\n"
"fmla z29.s, z8.s, z5.s[0]\n"
"ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
+ "add x21, x21, #0x10\n"
"fmla z30.s, z8.s, z6.s[0]\n"
"ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x28, #0x4\n"
+ "add x20, x20, #0x10\n"
"fmla z31.s, z8.s, z7.s[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"addvl x12, x12, #4\n"
"fmla z26.s, z9.s, z2.s[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"fmla z27.s, z9.s, z3.s[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla z28.s, z9.s, z4.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla z29.s, z9.s, z5.s[1]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"fmla z30.s, z9.s, z6.s[1]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"fmla z31.s, z9.s, z7.s[1]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"fmla z25.s, z10.s, z1.s[2]\n"
"fmla z26.s, z10.s, z2.s[2]\n"
"fmla z27.s, z10.s, z3.s[2]\n"
@@ -1483,28 +1391,20 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
+ "addvl x12, x12, #1\n"
"fmla z25.s, z8.s, z1.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
"ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "fmla z27.s, z8.s, z3.s[0]\n"
+ "fmla z26.s, z8.s, z2.s[0]\n"
"ld1rqw { z4.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
+ "fmla z27.s, z8.s, z3.s[0]\n"
"ld1rqw { z5.s }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "fmla z29.s, z8.s, z5.s[0]\n"
"ld1rqw { z6.s }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- "fmla z30.s, z8.s, z6.s[0]\n"
+ "fmla z28.s, z8.s, z4.s[0]\n"
"ld1rqw { z7.s }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
+ "fmla z29.s, z8.s, z5.s[0]\n"
+ "fmla z30.s, z8.s, z6.s[0]\n"
"fmla z31.s, z8.s, z7.s[0]\n"
- "add x20, x20, #0x10\n"
- "addvl x12, x12, #1\n"
"ble 102f\n"
"ld1w { z9.s }, p2/Z, [x12]\n"
"fmla z24.s, z9.s, z0.s[1]\n"
@@ -1541,16 +1441,8 @@ void sve_hybrid_fp32_mla_8x1VL (
"fmla z30.s, z11.s, z6.s[3]\n"
"fmla z31.s, z11.s, z7.s[3]\n"
"102:" // Height 8: Multiply loop: multiply skip
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x9, x9, #0x1\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x9, x9, #0x1\n"
"cmp x9, x19\n"
"bne 97b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"