From 74921eee924625426429044decefe3673561b174 Mon Sep 17 00:00:00 2001 From: Michael Tyler Date: Wed, 12 Apr 2023 17:43:17 +0100 Subject: Update CPU kernel implementations and guard directives Resolves COMPMID-6023 Change-Id: I868975d14c4f98af6716726feda22405a6a4c891 Signed-off-by: Michael Tyler Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9686 Tested-by: Arm Jenkins Reviewed-by: Viet-Hoa Do Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins --- .../sme_interleave2VL_block4_u8_u8_summing.hpp | 102 ++++++++++----------- 1 file changed, 51 insertions(+), 51 deletions(-) (limited to 'src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp') diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp index e7571f7da7..7805152656 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp @@ -22,7 +22,7 @@ * SOFTWARE. */ -#if defined(__ARM_FEATURE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SME) template <> void interleave_block<2, 4, VLType::SME, true>( @@ -140,23 +140,23 @@ void interleave_block<2, 4, VLType::SME, true>( ".inst 0xe01c2aa7 // ld1b { za0h.b[x13, #7] }, p2/Z, [x21, x28]\n" "ldr x21, [x25, #0x8]\n" ".inst 0xe0bf8760 // st1w { za0v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n" - ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n" - ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n" - "udot z19.s, z17.b, z20.b\n" + ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n" + ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n" + "udot z19.s, z16.b, z20.b\n" ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" - "udot z18.s, z16.b, z20.b\n" + "udot z18.s, z17.b, z20.b\n" ".inst 0xe0ae8361 // st1w { za0v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0xc0829031 // mova z17.s, p4/M, za0v.s[x12, #1]\n" + ".inst 0xc0829030 // mova z16.s, p4/M, za0v.s[x12, #1]\n" ".inst 0xe0ab8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n" - ".inst 0xc08290b0 // mova z16.s, p4/M, za1v.s[x12, #1]\n" + ".inst 0xc08290b1 // mova z17.s, p4/M, za1v.s[x12, #1]\n" "add x12, x12, #0x2\n" "cmp x12, x9\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" - "udot z19.s, z17.b, z20.b\n" - "udot z18.s, z16.b, z20.b\n" + "udot z19.s, z16.b, z20.b\n" + "udot z18.s, z17.b, z20.b\n" "addvl x27, x27, #4\n" "add x13, x13, #0x8\n" "blt 5b\n" @@ -172,28 +172,28 @@ void interleave_block<2, 4, VLType::SME, true>( "add x25, %x[in], x16, LSL #3\n" "ldr x24, [x26, #0x0]\n" ".inst 0xe01c22a7 // ld1b { za0h.b[x13, #7] }, p0/Z, [x21, x28]\n" - ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n" + ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n" ".inst 0x25306d23 // psel p3.s, p11.s/Z, p9.s[w12]\n" - "udot z19.s, z17.b, z20.b\n" - ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n" - "udot z18.s, z16.b, z20.b\n" + "udot z19.s, z16.b, z20.b\n" + ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n" + "udot z18.s, z17.b, z20.b\n" "ldr x23, [x25, #0x0]\n" ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n" "ldr x22, [x26, #0x8]\n" ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0xc0829031 // mova z17.s, p4/M, za0v.s[x12, #1]\n" + ".inst 0xc0829030 // mova z16.s, p4/M, za0v.s[x12, #1]\n" ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" "ldr x21, [x25, #0x8]\n" ".inst 0xe0bf8f60 // st1w { za0v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n" - ".inst 0xc08290b0 // mova z16.s, p4/M, za1v.s[x12, #1]\n" + ".inst 0xc08290b1 // mova z17.s, p4/M, za1v.s[x12, #1]\n" "whilelt p9.b, x15, %x[width]\n" ".inst 0xe0b08b64 // st1w { za1v.s[x12] }, p2/Z, [x27, x16, LSL #2]\n" "incb x15\n" "add x26, x26, #0x10\n" - "udot z19.s, z17.b, z20.b\n" + "udot z19.s, z16.b, z20.b\n" ".inst 0xe0ae8761 // st1w { za0v.s[x12, #1] }, p1/Z, [x27, x14, LSL #2]\n" "add x25, x25, #0x10\n" - "udot z18.s, z16.b, z20.b\n" + "udot z18.s, z17.b, z20.b\n" "incb x28\n" ".inst 0xe0ab8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n" "addvl x27, x27, #4\n" @@ -217,23 +217,23 @@ void interleave_block<2, 4, VLType::SME, true>( ".inst 0xe01c2aa5 // ld1b { za0h.b[x13, #5] }, p2/Z, [x21, x28]\n" "ldr x21, [x25, #0x8]\n" ".inst 0xe0bf8768 // st1w { za2v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n" - ".inst 0xc0829111 // mova z17.s, p4/M, za2v.s[x12]\n" - ".inst 0xc0829190 // mova z16.s, p4/M, za3v.s[x12]\n" - "udot z19.s, z17.b, z20.b\n" + ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n" + ".inst 0xc0829191 // mova z17.s, p4/M, za3v.s[x12]\n" + "udot z19.s, z16.b, z20.b\n" ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" - "udot z18.s, z16.b, z20.b\n" + "udot z18.s, z17.b, z20.b\n" ".inst 0xe0ae8369 // st1w { za2v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0xc0829131 // mova z17.s, p4/M, za2v.s[x12, #1]\n" + ".inst 0xc0829130 // mova z16.s, p4/M, za2v.s[x12, #1]\n" ".inst 0xe0ab836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n" - ".inst 0xc08291b0 // mova z16.s, p4/M, za3v.s[x12, #1]\n" + ".inst 0xc08291b1 // mova z17.s, p4/M, za3v.s[x12, #1]\n" "add x12, x12, #0x2\n" "cmp x12, x9\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" - "udot z19.s, z17.b, z20.b\n" - "udot z18.s, z16.b, z20.b\n" + "udot z19.s, z16.b, z20.b\n" + "udot z18.s, z17.b, z20.b\n" "addvl x27, x27, #4\n" "add x13, x13, #0x8\n" "blt 7b\n" @@ -249,28 +249,28 @@ void interleave_block<2, 4, VLType::SME, true>( "add x25, %x[in], x16, LSL #3\n" "ldr x24, [x26, #0x0]\n" ".inst 0xe01c22a5 // ld1b { za0h.b[x13, #5] }, p0/Z, [x21, x28]\n" - ".inst 0xc0829111 // mova z17.s, p4/M, za2v.s[x12]\n" + ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n" ".inst 0x25306d23 // psel p3.s, p11.s/Z, p9.s[w12]\n" - "udot z19.s, z17.b, z20.b\n" - ".inst 0xc0829190 // mova z16.s, p4/M, za3v.s[x12]\n" - "udot z18.s, z16.b, z20.b\n" + "udot z19.s, z16.b, z20.b\n" + ".inst 0xc0829191 // mova z17.s, p4/M, za3v.s[x12]\n" + "udot z18.s, z17.b, z20.b\n" "ldr x23, [x25, #0x0]\n" ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n" "ldr x22, [x26, #0x8]\n" ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0xc0829131 // mova z17.s, p4/M, za2v.s[x12, #1]\n" + ".inst 0xc0829130 // mova z16.s, p4/M, za2v.s[x12, #1]\n" ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" "ldr x21, [x25, #0x8]\n" ".inst 0xe0bf8f68 // st1w { za2v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n" - ".inst 0xc08291b0 // mova z16.s, p4/M, za3v.s[x12, #1]\n" + ".inst 0xc08291b1 // mova z17.s, p4/M, za3v.s[x12, #1]\n" "whilelt p9.b, x15, %x[width]\n" ".inst 0xe0b08b6c // st1w { za3v.s[x12] }, p2/Z, [x27, x16, LSL #2]\n" "subs x20, x20, #0x1\n" "add x26, x26, #0x10\n" - "udot z19.s, z17.b, z20.b\n" + "udot z19.s, z16.b, z20.b\n" ".inst 0xe0ae8769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x14, LSL #2]\n" "add x25, x25, #0x10\n" - "udot z18.s, z16.b, z20.b\n" + "udot z18.s, z17.b, z20.b\n" "incb x15\n" ".inst 0xe0ab836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n" "addvl x27, x27, #4\n" @@ -286,19 +286,19 @@ void interleave_block<2, 4, VLType::SME, true>( ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n" + ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n" ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - "ldr x24, [x26, #0x0]\n" + "ldr x21, [x26, #0x0]\n" ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" - ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n" - "ldr x23, [x26, x16, LSL #0x3]\n" - ".inst 0xe01c2302 // ld1b { za0h.b[x13, #2] }, p0/Z, [x24, x28]\n" + ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n" + "ldr x20, [x26, x16, LSL #0x3]\n" + ".inst 0xe01c22a2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x21, x28]\n" "add x12, x12, #0x1\n" ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n" "cmp x12, x16\n" - "udot z19.s, z17.b, z20.b\n" - "udot z18.s, z16.b, z20.b\n" - ".inst 0xe01c22e3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x23, x28]\n" + "udot z19.s, z16.b, z20.b\n" + "udot z18.s, z17.b, z20.b\n" + ".inst 0xe01c2283 // ld1b { za0h.b[x13, #3] }, p0/Z, [x20, x28]\n" "add x26, x26, #0x8\n" "addvl x27, x27, #2\n" "add x13, x13, #0x4\n" @@ -311,17 +311,17 @@ void interleave_block<2, 4, VLType::SME, true>( ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xc0829111 // mova z17.s, p4/M, za2v.s[x12]\n" + ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n" ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0xc0829190 // mova z16.s, p4/M, za3v.s[x12]\n" + ".inst 0xc0829191 // mova z17.s, p4/M, za3v.s[x12]\n" "add x12, x12, #0x1\n" "cmp x12, x17\n" - "udot z19.s, z17.b, z20.b\n" - "udot z18.s, z16.b, z20.b\n" + "udot z19.s, z16.b, z20.b\n" + "udot z18.s, z17.b, z20.b\n" "addvl x27, x27, #2\n" "add x20, x20, #0x4\n" "blt 11b\n" - "whilelt p9.b, x15, %x[width]\n" + "whilelt p8.b, x15, %x[width]\n" "b 14f\n" "12:" // K loop: Tails: Odd "mov x12, #0x0\n" @@ -329,13 +329,13 @@ void interleave_block<2, 4, VLType::SME, true>( ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n" + ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n" ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n" + ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n" "add x12, x12, #0x1\n" "cmp x12, x17\n" - "udot z19.s, z17.b, z20.b\n" - "udot z18.s, z16.b, z20.b\n" + "udot z19.s, z16.b, z20.b\n" + "udot z18.s, z17.b, z20.b\n" "addvl x27, x27, #2\n" "blt 13b\n" "14:" // K loop: End @@ -350,4 +350,4 @@ void interleave_block<2, 4, VLType::SME, true>( ); } -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ARM_COMPUTE_ENABLE_SME) -- cgit v1.2.1