about | summary | refs | log | tree | commit | diff
path: root/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp')
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp | 122
1 files changed, 63 insertions, 59 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp
index 94452929c6..e507bc5551 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp
@@ -29,8 +29,12 @@
namespace arm_gemm {
void sve_interleaved_bf16fp32_dot_8x3VL(
- const bfloat16 *Apanel, const bfloat16 *Bpanel,
- float *Cpanel, int ablocks, int bblocks, int K) {
+ const bfloat16 *Apanel,
+ const bfloat16 *Bpanel,
+ float *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
struct KernelArgs {
size_t K = {};
@@ -85,10 +89,10 @@ void sve_interleaved_bf16fp32_dot_8x3VL(
"3:" // main loop head
".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n"
".inst 0x6468408b // bfdot z11.s, z4.h, z0.h[1]\n"
- "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #32]\n"
+ "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #32]\n"
".inst 0x6470408e // bfdot z14.s, z4.h, z0.h[2]\n"
".inst 0x64784091 // bfdot z17.s, z4.h, z0.h[3]\n"
- "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #48]\n"
+ "ld1rqh { z7.h }, p0/Z, [%x[Apanel], #48]\n"
".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n"
".inst 0x64694097 // bfdot z23.s, z4.h, z1.h[1]\n"
"sub x20, x20, #0x2\n"
@@ -115,35 +119,35 @@ void sve_interleaved_bf16fp32_dot_8x3VL(
".inst 0x646940d9 // bfdot z25.s, z6.h, z1.h[1]\n"
".inst 0x647140dc // bfdot z28.s, z6.h, z1.h[2]\n"
".inst 0x647940df // bfdot z31.s, z6.h, z1.h[3]\n"
- "ld1h { z6.h }, p0/Z, [x22, #5, MUL VL]\n"
+ "ld1h { z2.h }, p0/Z, [x22, #5, MUL VL]\n"
"addvl x22, x22, #6\n"
- ".inst 0x64624088 // bfdot z8.s, z4.h, z2.h[0]\n"
- ".inst 0x646a408b // bfdot z11.s, z4.h, z2.h[1]\n"
+ ".inst 0x64634088 // bfdot z8.s, z4.h, z3.h[0]\n"
+ ".inst 0x646b408b // bfdot z11.s, z4.h, z3.h[1]\n"
"ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
- ".inst 0x6472408e // bfdot z14.s, z4.h, z2.h[2]\n"
- ".inst 0x647a4091 // bfdot z17.s, z4.h, z2.h[3]\n"
- ".inst 0x64634094 // bfdot z20.s, z4.h, z3.h[0]\n"
- ".inst 0x646b4097 // bfdot z23.s, z4.h, z3.h[1]\n"
- ".inst 0x6473409a // bfdot z26.s, z4.h, z3.h[2]\n"
- ".inst 0x647b409d // bfdot z29.s, z4.h, z3.h[3]\n"
+ ".inst 0x6473408e // bfdot z14.s, z4.h, z3.h[2]\n"
+ ".inst 0x647b4091 // bfdot z17.s, z4.h, z3.h[3]\n"
+ ".inst 0x64674094 // bfdot z20.s, z4.h, z7.h[0]\n"
+ ".inst 0x646f4097 // bfdot z23.s, z4.h, z7.h[1]\n"
+ ".inst 0x6477409a // bfdot z26.s, z4.h, z7.h[2]\n"
+ ".inst 0x647f409d // bfdot z29.s, z4.h, z7.h[3]\n"
"ld1h { z4.h }, p0/Z, [x22]\n"
- ".inst 0x646240a9 // bfdot z9.s, z5.h, z2.h[0]\n"
- ".inst 0x646a40ac // bfdot z12.s, z5.h, z2.h[1]\n"
- ".inst 0x647240af // bfdot z15.s, z5.h, z2.h[2]\n"
- ".inst 0x647a40b2 // bfdot z18.s, z5.h, z2.h[3]\n"
- ".inst 0x646340b5 // bfdot z21.s, z5.h, z3.h[0]\n"
- ".inst 0x646b40b8 // bfdot z24.s, z5.h, z3.h[1]\n"
- ".inst 0x647340bb // bfdot z27.s, z5.h, z3.h[2]\n"
- ".inst 0x647b40be // bfdot z30.s, z5.h, z3.h[3]\n"
+ ".inst 0x646340a9 // bfdot z9.s, z5.h, z3.h[0]\n"
+ ".inst 0x646b40ac // bfdot z12.s, z5.h, z3.h[1]\n"
+ ".inst 0x647340af // bfdot z15.s, z5.h, z3.h[2]\n"
+ ".inst 0x647b40b2 // bfdot z18.s, z5.h, z3.h[3]\n"
+ ".inst 0x646740b5 // bfdot z21.s, z5.h, z7.h[0]\n"
+ ".inst 0x646f40b8 // bfdot z24.s, z5.h, z7.h[1]\n"
+ ".inst 0x647740bb // bfdot z27.s, z5.h, z7.h[2]\n"
+ ".inst 0x647f40be // bfdot z30.s, z5.h, z7.h[3]\n"
"ld1h { z5.h }, p0/Z, [x22, #1, MUL VL]\n"
- ".inst 0x646240ca // bfdot z10.s, z6.h, z2.h[0]\n"
- ".inst 0x646a40cd // bfdot z13.s, z6.h, z2.h[1]\n"
- ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
- ".inst 0x647a40d3 // bfdot z19.s, z6.h, z2.h[3]\n"
- ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
- ".inst 0x646b40d9 // bfdot z25.s, z6.h, z3.h[1]\n"
- ".inst 0x647340dc // bfdot z28.s, z6.h, z3.h[2]\n"
- ".inst 0x647b40df // bfdot z31.s, z6.h, z3.h[3]\n"
+ ".inst 0x6463404a // bfdot z10.s, z2.h, z3.h[0]\n"
+ ".inst 0x646b404d // bfdot z13.s, z2.h, z3.h[1]\n"
+ ".inst 0x64734050 // bfdot z16.s, z2.h, z3.h[2]\n"
+ ".inst 0x647b4053 // bfdot z19.s, z2.h, z3.h[3]\n"
+ ".inst 0x64674056 // bfdot z22.s, z2.h, z7.h[0]\n"
+ ".inst 0x646f4059 // bfdot z25.s, z2.h, z7.h[1]\n"
+ ".inst 0x6477405c // bfdot z28.s, z2.h, z7.h[2]\n"
+ ".inst 0x647f405f // bfdot z31.s, z2.h, z7.h[3]\n"
"ld1h { z6.h }, p0/Z, [x22, #2, MUL VL]\n"
"bge 3b\n"
"4:" // main loop skip
@@ -174,37 +178,37 @@ void sve_interleaved_bf16fp32_dot_8x3VL(
".inst 0x647140dc // bfdot z28.s, z6.h, z1.h[2]\n"
".inst 0x647940df // bfdot z31.s, z6.h, z1.h[3]\n"
"cbz x20, 5f\n"
- "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
- "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1rqh { z4.h }, p0/Z, [%x[Apanel]]\n"
+ "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #16]\n"
"add %x[Apanel], %x[Apanel], #0x20\n"
- "ld1h { z7.h }, p0/Z, [x22]\n"
- "ld1h { z4.h }, p0/Z, [x22, #1, MUL VL]\n"
- ".inst 0x646040e8 // bfdot z8.s, z7.h, z0.h[0]\n"
- "ld1h { z5.h }, p0/Z, [x22, #2, MUL VL]\n"
- ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
- ".inst 0x647040ee // bfdot z14.s, z7.h, z0.h[2]\n"
- ".inst 0x647840f1 // bfdot z17.s, z7.h, z0.h[3]\n"
- ".inst 0x646140f4 // bfdot z20.s, z7.h, z1.h[0]\n"
+ "ld1h { z2.h }, p0/Z, [x22]\n"
+ "ld1h { z1.h }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x64644048 // bfdot z8.s, z2.h, z4.h[0]\n"
+ "ld1h { z0.h }, p0/Z, [x22, #2, MUL VL]\n"
+ ".inst 0x646c404b // bfdot z11.s, z2.h, z4.h[1]\n"
+ ".inst 0x6474404e // bfdot z14.s, z2.h, z4.h[2]\n"
+ ".inst 0x647c4051 // bfdot z17.s, z2.h, z4.h[3]\n"
+ ".inst 0x64634054 // bfdot z20.s, z2.h, z3.h[0]\n"
"addvl x22, x22, #3\n"
- ".inst 0x646940f7 // bfdot z23.s, z7.h, z1.h[1]\n"
- ".inst 0x647140fa // bfdot z26.s, z7.h, z1.h[2]\n"
- ".inst 0x647940fd // bfdot z29.s, z7.h, z1.h[3]\n"
- ".inst 0x64604089 // bfdot z9.s, z4.h, z0.h[0]\n"
- ".inst 0x6468408c // bfdot z12.s, z4.h, z0.h[1]\n"
- ".inst 0x6470408f // bfdot z15.s, z4.h, z0.h[2]\n"
- ".inst 0x64784092 // bfdot z18.s, z4.h, z0.h[3]\n"
- ".inst 0x64614095 // bfdot z21.s, z4.h, z1.h[0]\n"
- ".inst 0x64694098 // bfdot z24.s, z4.h, z1.h[1]\n"
- ".inst 0x6471409b // bfdot z27.s, z4.h, z1.h[2]\n"
- ".inst 0x6479409e // bfdot z30.s, z4.h, z1.h[3]\n"
- ".inst 0x646040aa // bfdot z10.s, z5.h, z0.h[0]\n"
- ".inst 0x646840ad // bfdot z13.s, z5.h, z0.h[1]\n"
- ".inst 0x647040b0 // bfdot z16.s, z5.h, z0.h[2]\n"
- ".inst 0x647840b3 // bfdot z19.s, z5.h, z0.h[3]\n"
- ".inst 0x646140b6 // bfdot z22.s, z5.h, z1.h[0]\n"
- ".inst 0x646940b9 // bfdot z25.s, z5.h, z1.h[1]\n"
- ".inst 0x647140bc // bfdot z28.s, z5.h, z1.h[2]\n"
- ".inst 0x647940bf // bfdot z31.s, z5.h, z1.h[3]\n"
+ ".inst 0x646b4057 // bfdot z23.s, z2.h, z3.h[1]\n"
+ ".inst 0x6473405a // bfdot z26.s, z2.h, z3.h[2]\n"
+ ".inst 0x647b405d // bfdot z29.s, z2.h, z3.h[3]\n"
+ ".inst 0x64644029 // bfdot z9.s, z1.h, z4.h[0]\n"
+ ".inst 0x646c402c // bfdot z12.s, z1.h, z4.h[1]\n"
+ ".inst 0x6474402f // bfdot z15.s, z1.h, z4.h[2]\n"
+ ".inst 0x647c4032 // bfdot z18.s, z1.h, z4.h[3]\n"
+ ".inst 0x64634035 // bfdot z21.s, z1.h, z3.h[0]\n"
+ ".inst 0x646b4038 // bfdot z24.s, z1.h, z3.h[1]\n"
+ ".inst 0x6473403b // bfdot z27.s, z1.h, z3.h[2]\n"
+ ".inst 0x647b403e // bfdot z30.s, z1.h, z3.h[3]\n"
+ ".inst 0x6464400a // bfdot z10.s, z0.h, z4.h[0]\n"
+ ".inst 0x646c400d // bfdot z13.s, z0.h, z4.h[1]\n"
+ ".inst 0x64744010 // bfdot z16.s, z0.h, z4.h[2]\n"
+ ".inst 0x647c4013 // bfdot z19.s, z0.h, z4.h[3]\n"
+ ".inst 0x64634016 // bfdot z22.s, z0.h, z3.h[0]\n"
+ ".inst 0x646b4019 // bfdot z25.s, z0.h, z3.h[1]\n"
+ ".inst 0x6473401c // bfdot z28.s, z0.h, z3.h[2]\n"
+ ".inst 0x647b401f // bfdot z31.s, z0.h, z3.h[3]\n"
"5:" // multiply loop done
"st1w { z8.s }, p0, [%x[Cpanel]]\n"
"subs x23, x23, #0x1\n"
@@ -243,4 +247,4 @@ void sve_interleaved_bf16fp32_dot_8x3VL(
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE