about | summary | refs | log | tree | commit | diff
path: root/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp')
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp 144
1 files changed, 72 insertions, 72 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp
index 36bfccf52f..1e3f2f300b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp
@@ -51,27 +51,27 @@ void a64_ffinterleaved_fp16_mla_8x24(
__asm__ __volatile__(
"1:" // Height loop
- "ldr x25, [%x[args_ptr], %[offsetof_Bpanel]]\n"
- "ldr x24, [%x[args_ptr], %[offsetof_N]]\n"
- "str x25, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "mov x23, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x24, %x[Apanel]\n"
"2:" // Width loop
- "ldr x25, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x23, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
"ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
- "add x22, x25, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
"add x21, x22, x20, LSL #1\n"
"add x20, x21, x20, LSL #1\n"
"str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
- "cmp x24, #0x10\n"
- "mov %x[Apanel], x23\n"
+ "cmp x25, #0x10\n"
+ "mov %x[Apanel], x24\n"
"bgt 3f\n"
- "cmp x24, #0x8\n"
- "mov x21, x25\n"
+ "cmp x25, #0x8\n"
+ "mov x21, x23\n"
"bgt 3f\n"
- "mov x22, x25\n"
+ "mov x22, x23\n"
"3:" // B setup done
"ldr q0, [%x[Apanel], #0x0]\n"
- "ldr q2, [x25, #0x0]\n"
+ "ldr q2, [x23, #0x0]\n"
"movi v8.16b, #0x0\n"
"ldr q3, [x22, #0x0]\n"
"ldr q4, [x21, #0x0]\n"
@@ -102,11 +102,11 @@ void a64_ffinterleaved_fp16_mla_8x24(
"movi v31.16b, #0x0\n"
"blt 5f\n"
"4:" // main loop head
- "ldr q1, [%x[Apanel], #0x10]\n"
- "ldr q5, [x25, #0x10]\n"
+ "ldr q7, [%x[Apanel], #0x10]\n"
+ "ldr q6, [x23, #0x10]\n"
"fmla v8.8h, v2.8h, v0.h[0]\n"
- "ldr q6, [x22, #0x10]\n"
- "ldr q7, [x21, #0x10]\n"
+ "ldr q5, [x22, #0x10]\n"
+ "ldr q1, [x21, #0x10]\n"
"fmla v11.8h, v2.8h, v0.h[1]\n"
"fmla v14.8h, v2.8h, v0.h[2]\n"
"fmla v17.8h, v2.8h, v0.h[3]\n"
@@ -119,8 +119,8 @@ void a64_ffinterleaved_fp16_mla_8x24(
"add %x[Apanel], %x[Apanel], #0x20\n"
"fmla v9.8h, v3.8h, v0.h[0]\n"
"fmla v12.8h, v3.8h, v0.h[1]\n"
- "add x25, x25, #0x20\n"
- "ldr q2, [x25, #0x0]\n"
+ "add x23, x23, #0x20\n"
+ "ldr q2, [x23, #0x0]\n"
"fmla v15.8h, v3.8h, v0.h[2]\n"
"fmla v18.8h, v3.8h, v0.h[3]\n"
"fmla v21.8h, v3.8h, v0.h[4]\n"
@@ -140,30 +140,30 @@ void a64_ffinterleaved_fp16_mla_8x24(
"fmla v31.8h, v4.8h, v0.h[7]\n"
"ldr q0, [%x[Apanel], #0x0]\n"
"ldr q4, [x21, #0x0]\n"
- "fmla v8.8h, v5.8h, v1.h[0]\n"
- "fmla v11.8h, v5.8h, v1.h[1]\n"
- "fmla v14.8h, v5.8h, v1.h[2]\n"
- "fmla v17.8h, v5.8h, v1.h[3]\n"
- "fmla v20.8h, v5.8h, v1.h[4]\n"
- "fmla v23.8h, v5.8h, v1.h[5]\n"
- "fmla v26.8h, v5.8h, v1.h[6]\n"
- "fmla v29.8h, v5.8h, v1.h[7]\n"
- "fmla v9.8h, v6.8h, v1.h[0]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "fmla v15.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v1.h[3]\n"
- "fmla v21.8h, v6.8h, v1.h[4]\n"
- "fmla v24.8h, v6.8h, v1.h[5]\n"
- "fmla v27.8h, v6.8h, v1.h[6]\n"
- "fmla v30.8h, v6.8h, v1.h[7]\n"
- "fmla v10.8h, v7.8h, v1.h[0]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "fmla v16.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v1.h[3]\n"
- "fmla v22.8h, v7.8h, v1.h[4]\n"
- "fmla v25.8h, v7.8h, v1.h[5]\n"
- "fmla v28.8h, v7.8h, v1.h[6]\n"
- "fmla v31.8h, v7.8h, v1.h[7]\n"
+ "fmla v8.8h, v6.8h, v7.h[0]\n"
+ "fmla v11.8h, v6.8h, v7.h[1]\n"
+ "fmla v14.8h, v6.8h, v7.h[2]\n"
+ "fmla v17.8h, v6.8h, v7.h[3]\n"
+ "fmla v20.8h, v6.8h, v7.h[4]\n"
+ "fmla v23.8h, v6.8h, v7.h[5]\n"
+ "fmla v26.8h, v6.8h, v7.h[6]\n"
+ "fmla v29.8h, v6.8h, v7.h[7]\n"
+ "fmla v9.8h, v5.8h, v7.h[0]\n"
+ "fmla v12.8h, v5.8h, v7.h[1]\n"
+ "fmla v15.8h, v5.8h, v7.h[2]\n"
+ "fmla v18.8h, v5.8h, v7.h[3]\n"
+ "fmla v21.8h, v5.8h, v7.h[4]\n"
+ "fmla v24.8h, v5.8h, v7.h[5]\n"
+ "fmla v27.8h, v5.8h, v7.h[6]\n"
+ "fmla v30.8h, v5.8h, v7.h[7]\n"
+ "fmla v10.8h, v1.8h, v7.h[0]\n"
+ "fmla v13.8h, v1.8h, v7.h[1]\n"
+ "fmla v16.8h, v1.8h, v7.h[2]\n"
+ "fmla v19.8h, v1.8h, v7.h[3]\n"
+ "fmla v22.8h, v1.8h, v7.h[4]\n"
+ "fmla v25.8h, v1.8h, v7.h[5]\n"
+ "fmla v28.8h, v1.8h, v7.h[6]\n"
+ "fmla v31.8h, v1.8h, v7.h[7]\n"
"bge 4b\n"
"5:" // main loop skip
"fmla v8.8h, v2.8h, v0.h[0]\n"
@@ -171,7 +171,7 @@ void a64_ffinterleaved_fp16_mla_8x24(
"add %x[Apanel], %x[Apanel], #0x10\n"
"fmla v14.8h, v2.8h, v0.h[2]\n"
"fmla v17.8h, v2.8h, v0.h[3]\n"
- "add x25, x25, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla v20.8h, v2.8h, v0.h[4]\n"
"fmla v23.8h, v2.8h, v0.h[5]\n"
"add x22, x22, #0x10\n"
@@ -195,37 +195,37 @@ void a64_ffinterleaved_fp16_mla_8x24(
"fmla v28.8h, v4.8h, v0.h[6]\n"
"fmla v31.8h, v4.8h, v0.h[7]\n"
"cbz x20, 6f\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "ldr q5, [x25, #0x0]\n"
- "fmla v8.8h, v5.8h, v0.h[0]\n"
- "ldr q6, [x22, #0x0]\n"
- "ldr q7, [x21, #0x0]\n"
- "fmla v11.8h, v5.8h, v0.h[1]\n"
- "fmla v14.8h, v5.8h, v0.h[2]\n"
- "fmla v17.8h, v5.8h, v0.h[3]\n"
+ "ldr q3, [%x[Apanel], #0x0]\n"
+ "ldr q2, [x23, #0x0]\n"
+ "fmla v8.8h, v2.8h, v3.h[0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "ldr q0, [x21, #0x0]\n"
+ "fmla v11.8h, v2.8h, v3.h[1]\n"
+ "fmla v14.8h, v2.8h, v3.h[2]\n"
+ "fmla v17.8h, v2.8h, v3.h[3]\n"
"add %x[Apanel], %x[Apanel], #0x10\n"
- "fmla v20.8h, v5.8h, v0.h[4]\n"
- "fmla v23.8h, v5.8h, v0.h[5]\n"
- "fmla v26.8h, v5.8h, v0.h[6]\n"
- "fmla v29.8h, v5.8h, v0.h[7]\n"
- "fmla v9.8h, v6.8h, v0.h[0]\n"
- "fmla v12.8h, v6.8h, v0.h[1]\n"
- "fmla v15.8h, v6.8h, v0.h[2]\n"
- "fmla v18.8h, v6.8h, v0.h[3]\n"
- "fmla v21.8h, v6.8h, v0.h[4]\n"
- "fmla v24.8h, v6.8h, v0.h[5]\n"
- "fmla v27.8h, v6.8h, v0.h[6]\n"
- "fmla v30.8h, v6.8h, v0.h[7]\n"
- "fmla v10.8h, v7.8h, v0.h[0]\n"
- "fmla v13.8h, v7.8h, v0.h[1]\n"
- "fmla v16.8h, v7.8h, v0.h[2]\n"
- "fmla v19.8h, v7.8h, v0.h[3]\n"
- "fmla v22.8h, v7.8h, v0.h[4]\n"
- "fmla v25.8h, v7.8h, v0.h[5]\n"
- "fmla v28.8h, v7.8h, v0.h[6]\n"
- "fmla v31.8h, v7.8h, v0.h[7]\n"
+ "fmla v20.8h, v2.8h, v3.h[4]\n"
+ "fmla v23.8h, v2.8h, v3.h[5]\n"
+ "fmla v26.8h, v2.8h, v3.h[6]\n"
+ "fmla v29.8h, v2.8h, v3.h[7]\n"
+ "fmla v9.8h, v1.8h, v3.h[0]\n"
+ "fmla v12.8h, v1.8h, v3.h[1]\n"
+ "fmla v15.8h, v1.8h, v3.h[2]\n"
+ "fmla v18.8h, v1.8h, v3.h[3]\n"
+ "fmla v21.8h, v1.8h, v3.h[4]\n"
+ "fmla v24.8h, v1.8h, v3.h[5]\n"
+ "fmla v27.8h, v1.8h, v3.h[6]\n"
+ "fmla v30.8h, v1.8h, v3.h[7]\n"
+ "fmla v10.8h, v0.8h, v3.h[0]\n"
+ "fmla v13.8h, v0.8h, v3.h[1]\n"
+ "fmla v16.8h, v0.8h, v3.h[2]\n"
+ "fmla v19.8h, v0.8h, v3.h[3]\n"
+ "fmla v22.8h, v0.8h, v3.h[4]\n"
+ "fmla v25.8h, v0.8h, v3.h[5]\n"
+ "fmla v28.8h, v0.8h, v3.h[6]\n"
+ "fmla v31.8h, v0.8h, v3.h[7]\n"
"6:" // multiply loop done
- "subs x24, x24, #0x18\n"
+ "subs x25, x25, #0x18\n"
"str q8, [%x[Cpanel], #0x0]\n"
"str q9, [%x[Cpanel], #0x10]\n"
"str q10, [%x[Cpanel], #0x20]\n"