about | summary | refs | log | tree | commit | diff
path: root/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp')
-rw-r--r-- src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp | 102
1 files changed, 51 insertions, 51 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp
index e7571f7da7..7805152656 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
template <>
void interleave_block<2, 4, VLType::SME, true>(
@@ -140,23 +140,23 @@ void interleave_block<2, 4, VLType::SME, true>(
".inst 0xe01c2aa7 // ld1b { za0h.b[x13, #7] }, p2/Z, [x21, x28]\n"
"ldr x21, [x25, #0x8]\n"
".inst 0xe0bf8760 // st1w { za0v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n"
- ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n"
- ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n"
- "udot z19.s, z17.b, z20.b\n"
+ ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n"
+ ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n"
+ "udot z19.s, z16.b, z20.b\n"
".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
- "udot z18.s, z16.b, z20.b\n"
+ "udot z18.s, z17.b, z20.b\n"
".inst 0xe0ae8361 // st1w { za0v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n"
".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
- ".inst 0xc0829031 // mova z17.s, p4/M, za0v.s[x12, #1]\n"
+ ".inst 0xc0829030 // mova z16.s, p4/M, za0v.s[x12, #1]\n"
".inst 0xe0ab8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
- ".inst 0xc08290b0 // mova z16.s, p4/M, za1v.s[x12, #1]\n"
+ ".inst 0xc08290b1 // mova z17.s, p4/M, za1v.s[x12, #1]\n"
"add x12, x12, #0x2\n"
"cmp x12, x9\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "udot z19.s, z17.b, z20.b\n"
- "udot z18.s, z16.b, z20.b\n"
+ "udot z19.s, z16.b, z20.b\n"
+ "udot z18.s, z17.b, z20.b\n"
"addvl x27, x27, #4\n"
"add x13, x13, #0x8\n"
"blt 5b\n"
@@ -172,28 +172,28 @@ void interleave_block<2, 4, VLType::SME, true>(
"add x25, %x[in], x16, LSL #3\n"
"ldr x24, [x26, #0x0]\n"
".inst 0xe01c22a7 // ld1b { za0h.b[x13, #7] }, p0/Z, [x21, x28]\n"
- ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n"
+ ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n"
".inst 0x25306d23 // psel p3.s, p11.s/Z, p9.s[w12]\n"
- "udot z19.s, z17.b, z20.b\n"
- ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n"
- "udot z18.s, z16.b, z20.b\n"
+ "udot z19.s, z16.b, z20.b\n"
+ ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n"
+ "udot z18.s, z17.b, z20.b\n"
"ldr x23, [x25, #0x0]\n"
".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
"ldr x22, [x26, #0x8]\n"
".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
- ".inst 0xc0829031 // mova z17.s, p4/M, za0v.s[x12, #1]\n"
+ ".inst 0xc0829030 // mova z16.s, p4/M, za0v.s[x12, #1]\n"
".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
"ldr x21, [x25, #0x8]\n"
".inst 0xe0bf8f60 // st1w { za0v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n"
- ".inst 0xc08290b0 // mova z16.s, p4/M, za1v.s[x12, #1]\n"
+ ".inst 0xc08290b1 // mova z17.s, p4/M, za1v.s[x12, #1]\n"
"whilelt p9.b, x15, %x[width]\n"
".inst 0xe0b08b64 // st1w { za1v.s[x12] }, p2/Z, [x27, x16, LSL #2]\n"
"incb x15\n"
"add x26, x26, #0x10\n"
- "udot z19.s, z17.b, z20.b\n"
+ "udot z19.s, z16.b, z20.b\n"
".inst 0xe0ae8761 // st1w { za0v.s[x12, #1] }, p1/Z, [x27, x14, LSL #2]\n"
"add x25, x25, #0x10\n"
- "udot z18.s, z16.b, z20.b\n"
+ "udot z18.s, z17.b, z20.b\n"
"incb x28\n"
".inst 0xe0ab8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
"addvl x27, x27, #4\n"
@@ -217,23 +217,23 @@ void interleave_block<2, 4, VLType::SME, true>(
".inst 0xe01c2aa5 // ld1b { za0h.b[x13, #5] }, p2/Z, [x21, x28]\n"
"ldr x21, [x25, #0x8]\n"
".inst 0xe0bf8768 // st1w { za2v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n"
- ".inst 0xc0829111 // mova z17.s, p4/M, za2v.s[x12]\n"
- ".inst 0xc0829190 // mova z16.s, p4/M, za3v.s[x12]\n"
- "udot z19.s, z17.b, z20.b\n"
+ ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n"
+ ".inst 0xc0829191 // mova z17.s, p4/M, za3v.s[x12]\n"
+ "udot z19.s, z16.b, z20.b\n"
".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
- "udot z18.s, z16.b, z20.b\n"
+ "udot z18.s, z17.b, z20.b\n"
".inst 0xe0ae8369 // st1w { za2v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n"
".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
- ".inst 0xc0829131 // mova z17.s, p4/M, za2v.s[x12, #1]\n"
+ ".inst 0xc0829130 // mova z16.s, p4/M, za2v.s[x12, #1]\n"
".inst 0xe0ab836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
- ".inst 0xc08291b0 // mova z16.s, p4/M, za3v.s[x12, #1]\n"
+ ".inst 0xc08291b1 // mova z17.s, p4/M, za3v.s[x12, #1]\n"
"add x12, x12, #0x2\n"
"cmp x12, x9\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "udot z19.s, z17.b, z20.b\n"
- "udot z18.s, z16.b, z20.b\n"
+ "udot z19.s, z16.b, z20.b\n"
+ "udot z18.s, z17.b, z20.b\n"
"addvl x27, x27, #4\n"
"add x13, x13, #0x8\n"
"blt 7b\n"
@@ -249,28 +249,28 @@ void interleave_block<2, 4, VLType::SME, true>(
"add x25, %x[in], x16, LSL #3\n"
"ldr x24, [x26, #0x0]\n"
".inst 0xe01c22a5 // ld1b { za0h.b[x13, #5] }, p0/Z, [x21, x28]\n"
- ".inst 0xc0829111 // mova z17.s, p4/M, za2v.s[x12]\n"
+ ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n"
".inst 0x25306d23 // psel p3.s, p11.s/Z, p9.s[w12]\n"
- "udot z19.s, z17.b, z20.b\n"
- ".inst 0xc0829190 // mova z16.s, p4/M, za3v.s[x12]\n"
- "udot z18.s, z16.b, z20.b\n"
+ "udot z19.s, z16.b, z20.b\n"
+ ".inst 0xc0829191 // mova z17.s, p4/M, za3v.s[x12]\n"
+ "udot z18.s, z17.b, z20.b\n"
"ldr x23, [x25, #0x0]\n"
".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
"ldr x22, [x26, #0x8]\n"
".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
- ".inst 0xc0829131 // mova z17.s, p4/M, za2v.s[x12, #1]\n"
+ ".inst 0xc0829130 // mova z16.s, p4/M, za2v.s[x12, #1]\n"
".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
"ldr x21, [x25, #0x8]\n"
".inst 0xe0bf8f68 // st1w { za2v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n"
- ".inst 0xc08291b0 // mova z16.s, p4/M, za3v.s[x12, #1]\n"
+ ".inst 0xc08291b1 // mova z17.s, p4/M, za3v.s[x12, #1]\n"
"whilelt p9.b, x15, %x[width]\n"
".inst 0xe0b08b6c // st1w { za3v.s[x12] }, p2/Z, [x27, x16, LSL #2]\n"
"subs x20, x20, #0x1\n"
"add x26, x26, #0x10\n"
- "udot z19.s, z17.b, z20.b\n"
+ "udot z19.s, z16.b, z20.b\n"
".inst 0xe0ae8769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x14, LSL #2]\n"
"add x25, x25, #0x10\n"
- "udot z18.s, z16.b, z20.b\n"
+ "udot z18.s, z17.b, z20.b\n"
"incb x15\n"
".inst 0xe0ab836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
"addvl x27, x27, #4\n"
@@ -286,19 +286,19 @@ void interleave_block<2, 4, VLType::SME, true>(
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n"
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n"
+ ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n"
".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
- "ldr x24, [x26, #0x0]\n"
+ "ldr x21, [x26, #0x0]\n"
".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
- ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n"
- "ldr x23, [x26, x16, LSL #0x3]\n"
- ".inst 0xe01c2302 // ld1b { za0h.b[x13, #2] }, p0/Z, [x24, x28]\n"
+ ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n"
+ "ldr x20, [x26, x16, LSL #0x3]\n"
+ ".inst 0xe01c22a2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x21, x28]\n"
"add x12, x12, #0x1\n"
".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
"cmp x12, x16\n"
- "udot z19.s, z17.b, z20.b\n"
- "udot z18.s, z16.b, z20.b\n"
- ".inst 0xe01c22e3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x23, x28]\n"
+ "udot z19.s, z16.b, z20.b\n"
+ "udot z18.s, z17.b, z20.b\n"
+ ".inst 0xe01c2283 // ld1b { za0h.b[x13, #3] }, p0/Z, [x20, x28]\n"
"add x26, x26, #0x8\n"
"addvl x27, x27, #2\n"
"add x13, x13, #0x4\n"
@@ -311,17 +311,17 @@ void interleave_block<2, 4, VLType::SME, true>(
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n"
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0xc0829111 // mova z17.s, p4/M, za2v.s[x12]\n"
+ ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n"
".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
- ".inst 0xc0829190 // mova z16.s, p4/M, za3v.s[x12]\n"
+ ".inst 0xc0829191 // mova z17.s, p4/M, za3v.s[x12]\n"
"add x12, x12, #0x1\n"
"cmp x12, x17\n"
- "udot z19.s, z17.b, z20.b\n"
- "udot z18.s, z16.b, z20.b\n"
+ "udot z19.s, z16.b, z20.b\n"
+ "udot z18.s, z17.b, z20.b\n"
"addvl x27, x27, #2\n"
"add x20, x20, #0x4\n"
"blt 11b\n"
- "whilelt p9.b, x15, %x[width]\n"
+ "whilelt p8.b, x15, %x[width]\n"
"b 14f\n"
"12:" // K loop: Tails: Odd
"mov x12, #0x0\n"
@@ -329,13 +329,13 @@ void interleave_block<2, 4, VLType::SME, true>(
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n"
".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
- ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n"
+ ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n"
".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
- ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n"
+ ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n"
"add x12, x12, #0x1\n"
"cmp x12, x17\n"
- "udot z19.s, z17.b, z20.b\n"
- "udot z18.s, z16.b, z20.b\n"
+ "udot z19.s, z16.b, z20.b\n"
+ "udot z18.s, z17.b, z20.b\n"
"addvl x27, x27, #2\n"
"blt 13b\n"
"14:" // K loop: End
@@ -350,4 +350,4 @@ void interleave_block<2, 4, VLType::SME, true>(
);
}
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)