diff options
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp')
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp | 30 |
1 files changed, 15 insertions, 15 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp index 75bc57a649..9e1b2dca3e 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp @@ -43,40 +43,40 @@ void sme_transpose_interleave_8VL_2x2(uint16_t *out, const uint16_t *in, size_t "ptrue p4.b\n" "1:" // Main row loop: Head "mov x24, %x[in]\n" - "cmp %x[height], #0x1\n" "add x23, x24, %x[in_stride]\n" - "mov x22, %x[out]\n" + "cmp %x[height], #0x1\n" "add %x[in], x23, %x[in_stride]\n" + "mov x22, %x[out]\n" "csel x23, x23, %x[pad_row], GT\n" "sub %x[height], %x[height], #0x2\n" "mov x21, %x[width]\n" "2:" // Main row loop: Column loop "mov x20, x21\n" - "decw x21, ALL, MUL #8\n" "whilelt p3.h, XZR, x20\n" + "ld1h { z20.h }, p3/Z, [x24]\n" "dech x20\n" "whilelt p2.h, XZR, x20\n" + "ld1h { z19.h }, p2/Z, [x24, #1, MUL VL]\n" "dech x20\n" - "ld1h { z21.h }, p3/Z, [x24]\n" "whilelt p1.h, XZR, x20\n" + "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n" "dech x20\n" - "ld1h { z20.h }, p2/Z, [x24, #1, MUL VL]\n" "whilelt p0.h, XZR, x20\n" - "ld1h { z25.h }, p1/Z, [x24, #2, MUL VL]\n" - "cmp x21, #0x0\n" "ld1h { z24.h }, p0/Z, [x24, #3, MUL VL]\n" + "ld1h { z17.h }, p3/Z, [x23]\n" + "decw x21, ALL, MUL #8\n" + "cmp x21, #0x0\n" + "zip1 z23.h, z20.h, z17.h\n" + "ld1h { z16.h }, p2/Z, [x23, #1, MUL VL]\n" "addvl x24, x24, #4\n" - "ld1h { z19.h }, p3/Z, [x23]\n" - "ld1h { z18.h }, p2/Z, [x23, #1, MUL VL]\n" + "zip2 z22.h, z20.h, z17.h\n" + "zip1 z21.h, z19.h, z16.h\n" "ld1h { z17.h }, p1/Z, [x23, #2, MUL VL]\n" + "zip2 z20.h, z19.h, z16.h\n" + "zip1 z19.h, z18.h, z17.h\n" "ld1h { z16.h }, p0/Z, [x23, #3, MUL VL]\n" "addvl x23, x23, #4\n" - "zip1 z23.h, z21.h, z19.h\n" - "zip2 z22.h, z21.h, z19.h\n" - "zip1 z21.h, z20.h, z18.h\n" - "zip2 z20.h, z20.h, z18.h\n" - "zip1 z19.h, z25.h, z17.h\n" - "zip2 z18.h, z25.h, z17.h\n" + "zip2 z18.h, z18.h, z17.h\n" "zip1 z17.h, z24.h, z16.h\n" "zip2 z16.h, z24.h, z16.h\n" "st1h { z23.h }, p4, [x22]\n" |