From 74921eee924625426429044decefe3673561b174 Mon Sep 17 00:00:00 2001 From: Michael Tyler Date: Wed, 12 Apr 2023 17:43:17 +0100 Subject: Update CPU kernel implementations and guard directives Resolves COMPMID-6023 Change-Id: I868975d14c4f98af6716726feda22405a6a4c891 Signed-off-by: Michael Tyler Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9686 Tested-by: Arm Jenkins Reviewed-by: Viet-Hoa Do Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins --- .../sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp | 122 +++++++++++---------- 1 file changed, 63 insertions(+), 59 deletions(-) (limited to 'src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp') diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp index 94452929c6..e507bc5551 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp @@ -29,8 +29,12 @@ namespace arm_gemm { void sve_interleaved_bf16fp32_dot_8x3VL( - const bfloat16 *Apanel, const bfloat16 *Bpanel, - float *Cpanel, int ablocks, int bblocks, int K) { + const bfloat16 *Apanel, + const bfloat16 *Bpanel, + float *Cpanel, + int ablocks, + int bblocks, + int K) { struct KernelArgs { size_t K = {}; @@ -85,10 +89,10 @@ void sve_interleaved_bf16fp32_dot_8x3VL( "3:" // main loop head ".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n" ".inst 0x6468408b // bfdot z11.s, z4.h, z0.h[1]\n" - "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #32]\n" + "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #32]\n" ".inst 0x6470408e // bfdot z14.s, z4.h, z0.h[2]\n" ".inst 0x64784091 // bfdot z17.s, z4.h, z0.h[3]\n" - "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #48]\n" + "ld1rqh { z7.h }, p0/Z, [%x[Apanel], #48]\n" ".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n" ".inst 0x64694097 // bfdot z23.s, z4.h, z1.h[1]\n" "sub x20, x20, #0x2\n" @@ -115,35 +119,35 @@ void sve_interleaved_bf16fp32_dot_8x3VL( ".inst 0x646940d9 // bfdot z25.s, z6.h, z1.h[1]\n" ".inst 0x647140dc // bfdot z28.s, z6.h, z1.h[2]\n" ".inst 0x647940df // bfdot z31.s, z6.h, z1.h[3]\n" - "ld1h { z6.h }, p0/Z, [x22, #5, MUL VL]\n" + "ld1h { z2.h }, p0/Z, [x22, #5, MUL VL]\n" "addvl x22, x22, #6\n" - ".inst 0x64624088 // bfdot z8.s, z4.h, z2.h[0]\n" - ".inst 0x646a408b // bfdot z11.s, z4.h, z2.h[1]\n" + ".inst 0x64634088 // bfdot z8.s, z4.h, z3.h[0]\n" + ".inst 0x646b408b // bfdot z11.s, z4.h, z3.h[1]\n" "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n" - ".inst 0x6472408e // bfdot z14.s, z4.h, z2.h[2]\n" - ".inst 0x647a4091 // bfdot z17.s, z4.h, z2.h[3]\n" - ".inst 0x64634094 // bfdot z20.s, z4.h, z3.h[0]\n" - ".inst 0x646b4097 // bfdot z23.s, z4.h, z3.h[1]\n" - ".inst 0x6473409a // bfdot z26.s, z4.h, z3.h[2]\n" - ".inst 0x647b409d // bfdot z29.s, z4.h, z3.h[3]\n" + ".inst 0x6473408e // bfdot z14.s, z4.h, z3.h[2]\n" + ".inst 0x647b4091 // bfdot z17.s, z4.h, z3.h[3]\n" + ".inst 0x64674094 // bfdot z20.s, z4.h, z7.h[0]\n" + ".inst 0x646f4097 // bfdot z23.s, z4.h, z7.h[1]\n" + ".inst 0x6477409a // bfdot z26.s, z4.h, z7.h[2]\n" + ".inst 0x647f409d // bfdot z29.s, z4.h, z7.h[3]\n" "ld1h { z4.h }, p0/Z, [x22]\n" - ".inst 0x646240a9 // bfdot z9.s, z5.h, z2.h[0]\n" - ".inst 0x646a40ac // bfdot z12.s, z5.h, z2.h[1]\n" - ".inst 0x647240af // bfdot z15.s, z5.h, z2.h[2]\n" - ".inst 0x647a40b2 // bfdot z18.s, z5.h, z2.h[3]\n" - ".inst 0x646340b5 // bfdot z21.s, z5.h, z3.h[0]\n" - ".inst 0x646b40b8 // bfdot z24.s, z5.h, z3.h[1]\n" - ".inst 0x647340bb // bfdot z27.s, z5.h, z3.h[2]\n" - ".inst 0x647b40be // bfdot z30.s, z5.h, z3.h[3]\n" + ".inst 0x646340a9 // bfdot z9.s, z5.h, z3.h[0]\n" + ".inst 0x646b40ac // bfdot z12.s, z5.h, z3.h[1]\n" + ".inst 0x647340af // bfdot z15.s, z5.h, z3.h[2]\n" + ".inst 0x647b40b2 // bfdot z18.s, z5.h, z3.h[3]\n" + ".inst 0x646740b5 // bfdot z21.s, z5.h, z7.h[0]\n" + ".inst 0x646f40b8 // bfdot z24.s, z5.h, z7.h[1]\n" + ".inst 0x647740bb // bfdot z27.s, z5.h, z7.h[2]\n" + ".inst 0x647f40be // bfdot z30.s, z5.h, z7.h[3]\n" "ld1h { z5.h }, p0/Z, [x22, #1, MUL VL]\n" - ".inst 0x646240ca // bfdot z10.s, z6.h, z2.h[0]\n" - ".inst 0x646a40cd // bfdot z13.s, z6.h, z2.h[1]\n" - ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" - ".inst 0x647a40d3 // bfdot z19.s, z6.h, z2.h[3]\n" - ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n" - ".inst 0x646b40d9 // bfdot z25.s, z6.h, z3.h[1]\n" - ".inst 0x647340dc // bfdot z28.s, z6.h, z3.h[2]\n" - ".inst 0x647b40df // bfdot z31.s, z6.h, z3.h[3]\n" + ".inst 0x6463404a // bfdot z10.s, z2.h, z3.h[0]\n" + ".inst 0x646b404d // bfdot z13.s, z2.h, z3.h[1]\n" + ".inst 0x64734050 // bfdot z16.s, z2.h, z3.h[2]\n" + ".inst 0x647b4053 // bfdot z19.s, z2.h, z3.h[3]\n" + ".inst 0x64674056 // bfdot z22.s, z2.h, z7.h[0]\n" + ".inst 0x646f4059 // bfdot z25.s, z2.h, z7.h[1]\n" + ".inst 0x6477405c // bfdot z28.s, z2.h, z7.h[2]\n" + ".inst 0x647f405f // bfdot z31.s, z2.h, z7.h[3]\n" "ld1h { z6.h }, p0/Z, [x22, #2, MUL VL]\n" "bge 3b\n" "4:" // main loop skip @@ -174,37 +178,37 @@ void sve_interleaved_bf16fp32_dot_8x3VL( ".inst 0x647140dc // bfdot z28.s, z6.h, z1.h[2]\n" ".inst 0x647940df // bfdot z31.s, z6.h, z1.h[3]\n" "cbz x20, 5f\n" - "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n" - "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n" + "ld1rqh { z4.h }, p0/Z, [%x[Apanel]]\n" + "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #16]\n" "add %x[Apanel], %x[Apanel], #0x20\n" - "ld1h { z7.h }, p0/Z, [x22]\n" - "ld1h { z4.h }, p0/Z, [x22, #1, MUL VL]\n" - ".inst 0x646040e8 // bfdot z8.s, z7.h, z0.h[0]\n" - "ld1h { z5.h }, p0/Z, [x22, #2, MUL VL]\n" - ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" - ".inst 0x647040ee // bfdot z14.s, z7.h, z0.h[2]\n" - ".inst 0x647840f1 // bfdot z17.s, z7.h, z0.h[3]\n" - ".inst 0x646140f4 // bfdot z20.s, z7.h, z1.h[0]\n" + "ld1h { z2.h }, p0/Z, [x22]\n" + "ld1h { z1.h }, p0/Z, [x22, #1, MUL VL]\n" + ".inst 0x64644048 // bfdot z8.s, z2.h, z4.h[0]\n" + "ld1h { z0.h }, p0/Z, [x22, #2, MUL VL]\n" + ".inst 0x646c404b // bfdot z11.s, z2.h, z4.h[1]\n" + ".inst 0x6474404e // bfdot z14.s, z2.h, z4.h[2]\n" + ".inst 0x647c4051 // bfdot z17.s, z2.h, z4.h[3]\n" + ".inst 0x64634054 // bfdot z20.s, z2.h, z3.h[0]\n" "addvl x22, x22, #3\n" - ".inst 0x646940f7 // bfdot z23.s, z7.h, z1.h[1]\n" - ".inst 0x647140fa // bfdot z26.s, z7.h, z1.h[2]\n" - ".inst 0x647940fd // bfdot z29.s, z7.h, z1.h[3]\n" - ".inst 0x64604089 // bfdot z9.s, z4.h, z0.h[0]\n" - ".inst 0x6468408c // bfdot z12.s, z4.h, z0.h[1]\n" - ".inst 0x6470408f // bfdot z15.s, z4.h, z0.h[2]\n" - ".inst 0x64784092 // bfdot z18.s, z4.h, z0.h[3]\n" - ".inst 0x64614095 // bfdot z21.s, z4.h, z1.h[0]\n" - ".inst 0x64694098 // bfdot z24.s, z4.h, z1.h[1]\n" - ".inst 0x6471409b // bfdot z27.s, z4.h, z1.h[2]\n" - ".inst 0x6479409e // bfdot z30.s, z4.h, z1.h[3]\n" - ".inst 0x646040aa // bfdot z10.s, z5.h, z0.h[0]\n" - ".inst 0x646840ad // bfdot z13.s, z5.h, z0.h[1]\n" - ".inst 0x647040b0 // bfdot z16.s, z5.h, z0.h[2]\n" - ".inst 0x647840b3 // bfdot z19.s, z5.h, z0.h[3]\n" - ".inst 0x646140b6 // bfdot z22.s, z5.h, z1.h[0]\n" - ".inst 0x646940b9 // bfdot z25.s, z5.h, z1.h[1]\n" - ".inst 0x647140bc // bfdot z28.s, z5.h, z1.h[2]\n" - ".inst 0x647940bf // bfdot z31.s, z5.h, z1.h[3]\n" + ".inst 0x646b4057 // bfdot z23.s, z2.h, z3.h[1]\n" + ".inst 0x6473405a // bfdot z26.s, z2.h, z3.h[2]\n" + ".inst 0x647b405d // bfdot z29.s, z2.h, z3.h[3]\n" + ".inst 0x64644029 // bfdot z9.s, z1.h, z4.h[0]\n" + ".inst 0x646c402c // bfdot z12.s, z1.h, z4.h[1]\n" + ".inst 0x6474402f // bfdot z15.s, z1.h, z4.h[2]\n" + ".inst 0x647c4032 // bfdot z18.s, z1.h, z4.h[3]\n" + ".inst 0x64634035 // bfdot z21.s, z1.h, z3.h[0]\n" + ".inst 0x646b4038 // bfdot z24.s, z1.h, z3.h[1]\n" + ".inst 0x6473403b // bfdot z27.s, z1.h, z3.h[2]\n" + ".inst 0x647b403e // bfdot z30.s, z1.h, z3.h[3]\n" + ".inst 0x6464400a // bfdot z10.s, z0.h, z4.h[0]\n" + ".inst 0x646c400d // bfdot z13.s, z0.h, z4.h[1]\n" + ".inst 0x64744010 // bfdot z16.s, z0.h, z4.h[2]\n" + ".inst 0x647c4013 // bfdot z19.s, z0.h, z4.h[3]\n" + ".inst 0x64634016 // bfdot z22.s, z0.h, z3.h[0]\n" + ".inst 0x646b4019 // bfdot z25.s, z0.h, z3.h[1]\n" + ".inst 0x6473401c // bfdot z28.s, z0.h, z3.h[2]\n" + ".inst 0x647b401f // bfdot z31.s, z0.h, z3.h[3]\n" "5:" // multiply loop done "st1w { z8.s }, p0, [%x[Cpanel]]\n" "subs x23, x23, #0x1\n" @@ -243,4 +247,4 @@ void sve_interleaved_bf16fp32_dot_8x3VL( } } // namespace arm_gemm -#endif // __ARM_FEATURE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE -- cgit v1.2.1