about summary refs log tree commit diff
path: root/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp')
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp 222
1 files changed, 111 insertions, 111 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp
index ec99d64f4a..6de0a380eb 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp
@@ -51,29 +51,29 @@ void a64_ffinterleaved_fp32_mla_8x12(
__asm__ __volatile__(
"1:" // Height loop
- "ldr x25, [%x[args_ptr], %[offsetof_Bpanel]]\n"
- "ldr x24, [%x[args_ptr], %[offsetof_N]]\n"
- "str x25, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "mov x23, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x24, %x[Apanel]\n"
"2:" // Width loop
- "ldr x25, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x23, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "add x22, x25, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"add x21, x22, x20, LSL #2\n"
"add x20, x21, x20, LSL #2\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "cmp x24, #0x8\n"
- "mov %x[Apanel], x23\n"
+ "cmp x25, #0x8\n"
+ "mov %x[Apanel], x24\n"
"bgt 3f\n"
- "cmp x24, #0x4\n"
- "mov x21, x25\n"
+ "cmp x25, #0x4\n"
+ "mov x21, x23\n"
"bgt 3f\n"
- "mov x22, x25\n"
+ "mov x22, x23\n"
"3:" // B setup done
"ldr q0, [%x[Apanel], #0x0]\n"
"ldr q1, [%x[Apanel], #0x10]\n"
"movi v8.16b, #0x0\n"
- "ldr q4, [x25, #0x0]\n"
+ "ldr q4, [x23, #0x0]\n"
"ldr q5, [x22, #0x0]\n"
"movi v9.16b, #0x0\n"
"ldr q6, [x21, #0x0]\n"
@@ -103,10 +103,10 @@ void a64_ffinterleaved_fp32_mla_8x12(
"movi v31.16b, #0x0\n"
"blt 5f\n"
"4:" // main loop head
- "ldr q2, [%x[Apanel], #0x20]\n"
- "ldr q3, [%x[Apanel], #0x30]\n"
+ "ldr q3, [%x[Apanel], #0x20]\n"
+ "ldr q7, [%x[Apanel], #0x30]\n"
"fmla v8.4s, v4.4s, v0.s[0]\n"
- "ldr q7, [x25, #0x10]\n"
+ "ldr q2, [x23, #0x10]\n"
"fmla v11.4s, v4.4s, v0.s[1]\n"
"fmla v14.4s, v4.4s, v0.s[2]\n"
"fmla v17.4s, v4.4s, v0.s[3]\n"
@@ -136,36 +136,36 @@ void a64_ffinterleaved_fp32_mla_8x12(
"fmla v28.4s, v6.4s, v1.s[2]\n"
"fmla v31.4s, v6.4s, v1.s[3]\n"
"ldr q1, [%x[Apanel], #0x50]\n"
- "ldr q6, [x25, #0x20]\n"
- "fmla v8.4s, v7.4s, v2.s[0]\n"
- "fmla v11.4s, v7.4s, v2.s[1]\n"
- "fmla v14.4s, v7.4s, v2.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "fmla v20.4s, v7.4s, v3.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[1]\n"
- "fmla v26.4s, v7.4s, v3.s[2]\n"
- "fmla v29.4s, v7.4s, v3.s[3]\n"
- "ldr q7, [x22, #0x20]\n"
- "fmla v9.4s, v4.4s, v2.s[0]\n"
- "fmla v12.4s, v4.4s, v2.s[1]\n"
- "fmla v15.4s, v4.4s, v2.s[2]\n"
- "fmla v18.4s, v4.4s, v2.s[3]\n"
- "fmla v21.4s, v4.4s, v3.s[0]\n"
- "fmla v24.4s, v4.4s, v3.s[1]\n"
- "fmla v27.4s, v4.4s, v3.s[2]\n"
- "fmla v30.4s, v4.4s, v3.s[3]\n"
+ "ldr q6, [x23, #0x20]\n"
+ "fmla v8.4s, v2.4s, v3.s[0]\n"
+ "fmla v11.4s, v2.4s, v3.s[1]\n"
+ "fmla v14.4s, v2.4s, v3.s[2]\n"
+ "fmla v17.4s, v2.4s, v3.s[3]\n"
+ "fmla v20.4s, v2.4s, v7.s[0]\n"
+ "fmla v23.4s, v2.4s, v7.s[1]\n"
+ "fmla v26.4s, v2.4s, v7.s[2]\n"
+ "fmla v29.4s, v2.4s, v7.s[3]\n"
+ "ldr q2, [x22, #0x20]\n"
+ "fmla v9.4s, v4.4s, v3.s[0]\n"
+ "fmla v12.4s, v4.4s, v3.s[1]\n"
+ "fmla v15.4s, v4.4s, v3.s[2]\n"
+ "fmla v18.4s, v4.4s, v3.s[3]\n"
+ "fmla v21.4s, v4.4s, v7.s[0]\n"
+ "fmla v24.4s, v4.4s, v7.s[1]\n"
+ "fmla v27.4s, v4.4s, v7.s[2]\n"
+ "fmla v30.4s, v4.4s, v7.s[3]\n"
"ldr q4, [x21, #0x20]\n"
- "fmla v10.4s, v5.4s, v2.s[0]\n"
- "fmla v13.4s, v5.4s, v2.s[1]\n"
- "fmla v16.4s, v5.4s, v2.s[2]\n"
- "fmla v19.4s, v5.4s, v2.s[3]\n"
- "ldr q2, [%x[Apanel], #0x60]\n"
- "fmla v22.4s, v5.4s, v3.s[0]\n"
- "fmla v25.4s, v5.4s, v3.s[1]\n"
- "fmla v28.4s, v5.4s, v3.s[2]\n"
- "fmla v31.4s, v5.4s, v3.s[3]\n"
- "ldr q3, [%x[Apanel], #0x70]\n"
- "ldr q5, [x25, #0x30]\n"
+ "fmla v10.4s, v5.4s, v3.s[0]\n"
+ "fmla v13.4s, v5.4s, v3.s[1]\n"
+ "fmla v16.4s, v5.4s, v3.s[2]\n"
+ "fmla v19.4s, v5.4s, v3.s[3]\n"
+ "ldr q3, [%x[Apanel], #0x60]\n"
+ "fmla v22.4s, v5.4s, v7.s[0]\n"
+ "fmla v25.4s, v5.4s, v7.s[1]\n"
+ "fmla v28.4s, v5.4s, v7.s[2]\n"
+ "fmla v31.4s, v5.4s, v7.s[3]\n"
+ "ldr q7, [%x[Apanel], #0x70]\n"
+ "ldr q5, [x23, #0x30]\n"
"fmla v8.4s, v6.4s, v0.s[0]\n"
"fmla v11.4s, v6.4s, v0.s[1]\n"
"fmla v14.4s, v6.4s, v0.s[2]\n"
@@ -173,20 +173,20 @@ void a64_ffinterleaved_fp32_mla_8x12(
"add %x[Apanel], %x[Apanel], #0x80\n"
"fmla v20.4s, v6.4s, v1.s[0]\n"
"fmla v23.4s, v6.4s, v1.s[1]\n"
- "add x25, x25, #0x40\n"
+ "add x23, x23, #0x40\n"
"fmla v26.4s, v6.4s, v1.s[2]\n"
"fmla v29.4s, v6.4s, v1.s[3]\n"
"ldr q6, [x22, #0x30]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v12.4s, v7.4s, v0.s[1]\n"
+ "fmla v9.4s, v2.4s, v0.s[0]\n"
+ "fmla v12.4s, v2.4s, v0.s[1]\n"
"add x22, x22, #0x40\n"
- "fmla v15.4s, v7.4s, v0.s[2]\n"
- "fmla v18.4s, v7.4s, v0.s[3]\n"
- "fmla v21.4s, v7.4s, v1.s[0]\n"
- "fmla v24.4s, v7.4s, v1.s[1]\n"
- "fmla v27.4s, v7.4s, v1.s[2]\n"
- "fmla v30.4s, v7.4s, v1.s[3]\n"
- "ldr q7, [x21, #0x30]\n"
+ "fmla v15.4s, v2.4s, v0.s[2]\n"
+ "fmla v18.4s, v2.4s, v0.s[3]\n"
+ "fmla v21.4s, v2.4s, v1.s[0]\n"
+ "fmla v24.4s, v2.4s, v1.s[1]\n"
+ "fmla v27.4s, v2.4s, v1.s[2]\n"
+ "fmla v30.4s, v2.4s, v1.s[3]\n"
+ "ldr q2, [x21, #0x30]\n"
"fmla v10.4s, v4.4s, v0.s[0]\n"
"fmla v13.4s, v4.4s, v0.s[1]\n"
"add x21, x21, #0x40\n"
@@ -198,33 +198,33 @@ void a64_ffinterleaved_fp32_mla_8x12(
"fmla v28.4s, v4.4s, v1.s[2]\n"
"fmla v31.4s, v4.4s, v1.s[3]\n"
"ldr q1, [%x[Apanel], #0x10]\n"
- "ldr q4, [x25, #0x0]\n"
- "fmla v8.4s, v5.4s, v2.s[0]\n"
- "fmla v11.4s, v5.4s, v2.s[1]\n"
- "fmla v14.4s, v5.4s, v2.s[2]\n"
- "fmla v17.4s, v5.4s, v2.s[3]\n"
- "fmla v20.4s, v5.4s, v3.s[0]\n"
- "fmla v23.4s, v5.4s, v3.s[1]\n"
- "fmla v26.4s, v5.4s, v3.s[2]\n"
- "fmla v29.4s, v5.4s, v3.s[3]\n"
+ "ldr q4, [x23, #0x0]\n"
+ "fmla v8.4s, v5.4s, v3.s[0]\n"
+ "fmla v11.4s, v5.4s, v3.s[1]\n"
+ "fmla v14.4s, v5.4s, v3.s[2]\n"
+ "fmla v17.4s, v5.4s, v3.s[3]\n"
+ "fmla v20.4s, v5.4s, v7.s[0]\n"
+ "fmla v23.4s, v5.4s, v7.s[1]\n"
+ "fmla v26.4s, v5.4s, v7.s[2]\n"
+ "fmla v29.4s, v5.4s, v7.s[3]\n"
"ldr q5, [x22, #0x0]\n"
- "fmla v9.4s, v6.4s, v2.s[0]\n"
- "fmla v12.4s, v6.4s, v2.s[1]\n"
- "fmla v15.4s, v6.4s, v2.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "fmla v21.4s, v6.4s, v3.s[0]\n"
- "fmla v24.4s, v6.4s, v3.s[1]\n"
- "fmla v27.4s, v6.4s, v3.s[2]\n"
- "fmla v30.4s, v6.4s, v3.s[3]\n"
+ "fmla v9.4s, v6.4s, v3.s[0]\n"
+ "fmla v12.4s, v6.4s, v3.s[1]\n"
+ "fmla v15.4s, v6.4s, v3.s[2]\n"
+ "fmla v18.4s, v6.4s, v3.s[3]\n"
+ "fmla v21.4s, v6.4s, v7.s[0]\n"
+ "fmla v24.4s, v6.4s, v7.s[1]\n"
+ "fmla v27.4s, v6.4s, v7.s[2]\n"
+ "fmla v30.4s, v6.4s, v7.s[3]\n"
"ldr q6, [x21, #0x0]\n"
- "fmla v10.4s, v7.4s, v2.s[0]\n"
- "fmla v13.4s, v7.4s, v2.s[1]\n"
- "fmla v16.4s, v7.4s, v2.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
- "fmla v22.4s, v7.4s, v3.s[0]\n"
- "fmla v25.4s, v7.4s, v3.s[1]\n"
- "fmla v28.4s, v7.4s, v3.s[2]\n"
- "fmla v31.4s, v7.4s, v3.s[3]\n"
+ "fmla v10.4s, v2.4s, v3.s[0]\n"
+ "fmla v13.4s, v2.4s, v3.s[1]\n"
+ "fmla v16.4s, v2.4s, v3.s[2]\n"
+ "fmla v19.4s, v2.4s, v3.s[3]\n"
+ "fmla v22.4s, v2.4s, v7.s[0]\n"
+ "fmla v25.4s, v2.4s, v7.s[1]\n"
+ "fmla v28.4s, v2.4s, v7.s[2]\n"
+ "fmla v31.4s, v2.4s, v7.s[3]\n"
"bge 4b\n"
"5:" // main loop skip
"fmla v8.4s, v4.4s, v0.s[0]\n"
@@ -232,7 +232,7 @@ void a64_ffinterleaved_fp32_mla_8x12(
"add %x[Apanel], %x[Apanel], #0x20\n"
"fmla v14.4s, v4.4s, v0.s[2]\n"
"fmla v17.4s, v4.4s, v0.s[3]\n"
- "add x25, x25, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla v20.4s, v4.4s, v1.s[0]\n"
"fmla v23.4s, v4.4s, v1.s[1]\n"
"add x22, x22, #0x10\n"
@@ -257,43 +257,43 @@ void a64_ffinterleaved_fp32_mla_8x12(
"fmla v31.4s, v6.4s, v1.s[3]\n"
"cbz x20, 7f\n"
"6:" // odd loop
- "ldr q0, [%x[Apanel], #0x0]\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
+ "ldr q4, [%x[Apanel], #0x0]\n"
+ "ldr q3, [%x[Apanel], #0x10]\n"
"subs x20, x20, #0x1\n"
- "ldr q7, [x25, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
- "fmla v8.4s, v7.4s, v0.s[0]\n"
- "ldr q5, [x21, #0x0]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v14.4s, v7.4s, v0.s[2]\n"
- "fmla v17.4s, v7.4s, v0.s[3]\n"
- "fmla v20.4s, v7.4s, v1.s[0]\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "fmla v8.4s, v2.4s, v4.s[0]\n"
+ "ldr q0, [x21, #0x0]\n"
+ "fmla v11.4s, v2.4s, v4.s[1]\n"
+ "fmla v14.4s, v2.4s, v4.s[2]\n"
+ "fmla v17.4s, v2.4s, v4.s[3]\n"
+ "fmla v20.4s, v2.4s, v3.s[0]\n"
"add %x[Apanel], %x[Apanel], #0x20\n"
- "fmla v23.4s, v7.4s, v1.s[1]\n"
- "fmla v26.4s, v7.4s, v1.s[2]\n"
- "add x25, x25, #0x10\n"
- "fmla v29.4s, v7.4s, v1.s[3]\n"
- "fmla v9.4s, v4.4s, v0.s[0]\n"
+ "fmla v23.4s, v2.4s, v3.s[1]\n"
+ "fmla v26.4s, v2.4s, v3.s[2]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v29.4s, v2.4s, v3.s[3]\n"
+ "fmla v9.4s, v1.4s, v4.s[0]\n"
"add x22, x22, #0x10\n"
- "fmla v12.4s, v4.4s, v0.s[1]\n"
- "fmla v15.4s, v4.4s, v0.s[2]\n"
+ "fmla v12.4s, v1.4s, v4.s[1]\n"
+ "fmla v15.4s, v1.4s, v4.s[2]\n"
"add x21, x21, #0x10\n"
- "fmla v18.4s, v4.4s, v0.s[3]\n"
- "fmla v21.4s, v4.4s, v1.s[0]\n"
- "fmla v24.4s, v4.4s, v1.s[1]\n"
- "fmla v27.4s, v4.4s, v1.s[2]\n"
- "fmla v30.4s, v4.4s, v1.s[3]\n"
- "fmla v10.4s, v5.4s, v0.s[0]\n"
- "fmla v13.4s, v5.4s, v0.s[1]\n"
- "fmla v16.4s, v5.4s, v0.s[2]\n"
- "fmla v19.4s, v5.4s, v0.s[3]\n"
- "fmla v22.4s, v5.4s, v1.s[0]\n"
- "fmla v25.4s, v5.4s, v1.s[1]\n"
- "fmla v28.4s, v5.4s, v1.s[2]\n"
- "fmla v31.4s, v5.4s, v1.s[3]\n"
+ "fmla v18.4s, v1.4s, v4.s[3]\n"
+ "fmla v21.4s, v1.4s, v3.s[0]\n"
+ "fmla v24.4s, v1.4s, v3.s[1]\n"
+ "fmla v27.4s, v1.4s, v3.s[2]\n"
+ "fmla v30.4s, v1.4s, v3.s[3]\n"
+ "fmla v10.4s, v0.4s, v4.s[0]\n"
+ "fmla v13.4s, v0.4s, v4.s[1]\n"
+ "fmla v16.4s, v0.4s, v4.s[2]\n"
+ "fmla v19.4s, v0.4s, v4.s[3]\n"
+ "fmla v22.4s, v0.4s, v3.s[0]\n"
+ "fmla v25.4s, v0.4s, v3.s[1]\n"
+ "fmla v28.4s, v0.4s, v3.s[2]\n"
+ "fmla v31.4s, v0.4s, v3.s[3]\n"
"bne 6b\n"
"7:" // multiply loop done
- "subs x24, x24, #0xc\n"
+ "subs x25, x25, #0xc\n"
"str q8, [%x[Cpanel], #0x0]\n"
"str q9, [%x[Cpanel], #0x10]\n"
"str q10, [%x[Cpanel], #0x20]\n"