about | summary | refs | log | tree | commit | diff
path: root/src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp
diff options
context:
space:
mode:
author David Mansell <David.Mansell@arm.com> 2022-09-23 09:57:43 +0100
committer Pablo Marquez Tello <pablo.tello@arm.com> 2022-09-23 17:20:25 +0000
commit ce79ac6297e6eb2407abd24846b8504dee43770f (patch)
tree f78e9b3c0df81f5262084bf197b961bbe17efb54 /src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp
parent ead4d1101ca51a6862e6e52c0175864139710f2b (diff)
download ComputeLibrary-ce79ac6297e6eb2407abd24846b8504dee43770f.tar.gz
CPU GEMM: Fix overreads in SVE merges.
SVE merges for interleaved kernels were not guarding bias reads with the correct predicates, leading to overreads and crashes in some cases. Fix to use the appropriate predicate.

Resolves: COMPMID-5627
Change-Id: Ib049531c4a3bea56e90623b7b9f0d6a7ab4db2c8
Signed-off-by: David Mansell <David.Mansell@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8315
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Marquez Tello <pablo.tello@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp')
-rw-r--r-- src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp 340
1 file changed, 170 insertions, 170 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp
index c009881254..115ba59459 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020,2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -781,19 +781,19 @@ void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout,
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
- "incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z13.s, p0/z, [%[inptr]]\n"
+ "incw %[p], all, mul #1\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "add z13.s, z13.s, z2.s\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
"whilelt p2.s, %[p], %[w]\n"
- "add z13.s, z13.s, z2.s\n"
"add z14.s, z14.s, z3.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
"addvl %[inptr], %[inptr], #24\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
+ "add z15.s, z15.s, z4.s\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"addvl %[outptr0], %[outptr0], #3\n"
: [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
@@ -817,27 +817,27 @@ void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout,
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
- "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"add z14.s, z14.s, z3.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
+ "add z17.s, z17.s, z3.s\n"
"ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
"addvl %[inptr], %[inptr], #24\n"
+ "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
"add z15.s, z15.s, z4.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z16.s, z16.s, z2.s\n"
- "add z17.s, z17.s, z3.s\n"
"add z18.s, z18.s, z4.s\n"
- "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"addvl %[outptr0], %[outptr0], #3\n"
"st1w z16.s, p0, [%[outptr1]]\n"
@@ -865,38 +865,38 @@ void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout,
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
- "add z19.s, z19.s, z2.s\n"
- "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
"addvl %[inptr], %[inptr], #24\n"
+ "add z18.s, z18.s, z4.s\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "addvl %[outptr0], %[outptr0], #3\n"
+ "add z19.s, z19.s, z2.s\n"
+ "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
"add z20.s, z20.s, z3.s\n"
- "add z13.s, z13.s, z4.s\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
"st1w z16.s, p0, [%[outptr1]]\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
@@ -925,44 +925,44 @@ void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout,
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"add z19.s, z19.s, z2.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"add z20.s, z20.s, z3.s\n"
- "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "add z13.s, z13.s, z4.s\n"
+ "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "add z14.s, z14.s, z2.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "addvl %[inptr], %[inptr], #24\n"
- "add z15.s, z15.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
+ "add z15.s, z15.s, z3.s\n"
+ "addvl %[inptr], %[inptr], #24\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
"add z16.s, z16.s, z4.s\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
@@ -996,49 +996,49 @@ void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout,
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"add z19.s, z19.s, z2.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"add z20.s, z20.s, z3.s\n"
- "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "add z13.s, z13.s, z4.s\n"
+ "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "add z14.s, z14.s, z2.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "add z15.s, z15.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "add z15.s, z15.s, z3.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
- "addvl %[inptr], %[inptr], #24\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
"add z16.s, z16.s, z4.s\n"
"ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
+ "addvl %[inptr], %[inptr], #24\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
"add z17.s, z17.s, z2.s\n"
@@ -1079,61 +1079,61 @@ void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout,
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"add z19.s, z19.s, z2.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"add z20.s, z20.s, z3.s\n"
- "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "add z13.s, z13.s, z4.s\n"
+ "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "add z14.s, z14.s, z2.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "add z15.s, z15.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "add z15.s, z15.s, z3.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
"add z16.s, z16.s, z4.s\n"
"ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
"add z17.s, z17.s, z2.s\n"
"ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
- "addvl %[inptr], %[inptr], #24\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"st1w z19.s, p0, [%[outptr2]]\n"
- "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
+ "addvl %[inptr], %[inptr], #24\n"
"add z18.s, z18.s, z3.s\n"
+ "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
"st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
- "add z19.s, z19.s, z4.s\n"
"ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
+ "add z19.s, z19.s, z4.s\n"
"st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
"addvl %[outptr2], %[outptr2], #3\n"
"add z20.s, z20.s, z2.s\n"
@@ -1174,63 +1174,63 @@ void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout,
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"add z19.s, z19.s, z2.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"add z20.s, z20.s, z3.s\n"
- "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "add z13.s, z13.s, z4.s\n"
+ "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "add z14.s, z14.s, z2.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "add z15.s, z15.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "add z15.s, z15.s, z3.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
"add z16.s, z16.s, z4.s\n"
"ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
"add z17.s, z17.s, z2.s\n"
"ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"st1w z19.s, p0, [%[outptr2]]\n"
- "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
"add z18.s, z18.s, z3.s\n"
"ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
- "addvl %[inptr], %[inptr], #24\n"
+ "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
"st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
- "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
+ "addvl %[inptr], %[inptr], #24\n"
"add z19.s, z19.s, z4.s\n"
+ "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
"st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
"addvl %[outptr2], %[outptr2], #3\n"
"add z20.s, z20.s, z2.s\n"
@@ -1282,64 +1282,64 @@ void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout,
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"add z19.s, z19.s, z2.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"add z20.s, z20.s, z3.s\n"
- "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "add z13.s, z13.s, z4.s\n"
+ "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "add z14.s, z14.s, z2.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "add z15.s, z15.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "add z15.s, z15.s, z3.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
"add z16.s, z16.s, z4.s\n"
"ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
"add z17.s, z17.s, z2.s\n"
"ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"st1w z19.s, p0, [%[outptr2]]\n"
- "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
"add z18.s, z18.s, z3.s\n"
"ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
+ "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
"st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
- "addvl %[inptr], %[inptr], #24\n"
+ "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
"add z19.s, z19.s, z4.s\n"
"ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
+ "addvl %[inptr], %[inptr], #24\n"
"st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
"addvl %[outptr2], %[outptr2], #3\n"
"add z20.s, z20.s, z2.s\n"