about | summary | refs | log | tree | commit | diff
path: root/src/core/NEON/kernels
diff options
context:
space:
mode:
author: David Mansell <David.Mansell@arm.com> 2022-09-23 09:57:43 +0100
committer: Pablo Marquez Tello <pablo.tello@arm.com> 2022-09-23 17:20:25 +0000
commit: ce79ac6297e6eb2407abd24846b8504dee43770f (patch)
tree: f78e9b3c0df81f5262084bf197b961bbe17efb54 /src/core/NEON/kernels
parent: ead4d1101ca51a6862e6e52c0175864139710f2b (diff)
download: ComputeLibrary-ce79ac6297e6eb2407abd24846b8504dee43770f.tar.gz
CPU GEMM: Fix overreads in SVE merges.
SVE merges for interleaved kernels were not guarding bias reads with the correct predicates, leading to overreads and crashes in some cases. Fix to use the appropriate predicate.

Resolves: COMPMID-5627
Change-Id: Ib049531c4a3bea56e90623b7b9f0d6a7ab4db2c8
Signed-off-by: David Mansell <David.Mansell@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8315
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Marquez Tello <pablo.tello@arm.com>
Diffstat (limited to 'src/core/NEON/kernels')
-rw-r--r-- src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp16_3VLx8.hpp 600
-rw-r--r-- src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp 600
-rw-r--r-- src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp 340
-rw-r--r-- src/core/NEON/kernels/arm_gemm/merges/sve_merge_u32_3VLx8.hpp 340
4 files changed, 940 insertions, 940 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp16_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp16_3VLx8.hpp
index 4da32b459c..a211a03697 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp16_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp16_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020,2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -1029,25 +1029,25 @@ void MergeResults<3, 8, true>(__fp16 *out, const __fp16 *in, const int ldout, co
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1h z2.h, p0/z, [%[biasptr]]\n"
"whilelt p1.h, %[p], %[w]\n"
- "ld1h z3.h, p0/z, [%[biasptr], #1, MUL VL]\n"
- "inch %[p], all, mul #1\n"
- "ld1h z4.h, p0/z, [%[biasptr], #2, MUL VL]\n"
"ld1h z13.h, p0/z, [%[inptr]]\n"
+ "inch %[p], all, mul #1\n"
+ "ld1h z3.h, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "fadd z13.h, z13.h, z2.h\n"
"ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
"whilelt p2.h, %[p], %[w]\n"
- "fadd z13.h, z13.h, z2.h\n"
+ "fmin z13.h, p0/m, z13.h, z0.h\n"
+ "ld1h z4.h, p2/z, [%[biasptr], #2, MUL VL]\n"
"fadd z14.h, z14.h, z3.h\n"
"ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
"addvl %[inptr], %[inptr], #24\n"
- "fmin z13.h, p0/m, z13.h, z0.h\n"
+ "fmax z13.h, p0/m, z13.h, z1.h\n"
"fmin z14.h, p1/m, z14.h, z0.h\n"
"fadd z15.h, z15.h, z4.h\n"
- "fmax z13.h, p0/m, z13.h, z1.h\n"
+ "st1h z13.h, p0, [%[outptr0]]\n"
"fmax z14.h, p1/m, z14.h, z1.h\n"
"fmin z15.h, p2/m, z15.h, z0.h\n"
- "st1h z13.h, p0, [%[outptr0]]\n"
- "fmax z15.h, p2/m, z15.h, z1.h\n"
"st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
+ "fmax z15.h, p2/m, z15.h, z1.h\n"
"st1h z15.h, p2, [%[outptr0], #2, MUL VL]\n"
"addvl %[outptr0], %[outptr0], #3\n"
: [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
@@ -1073,42 +1073,42 @@ void MergeResults<3, 8, true>(__fp16 *out, const __fp16 *in, const int ldout, co
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1h z2.h, p0/z, [%[biasptr]]\n"
"whilelt p1.h, %[p], %[w]\n"
- "ld1h z3.h, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1h z13.h, p0/z, [%[inptr]]\n"
"inch %[p], all, mul #1\n"
- "ld1h z4.h, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1h z13.h, p0/z, [%[inptr]]\n"
- "whilelt p2.h, %[p], %[w]\n"
+ "fadd z13.h, z13.h, z2.h\n"
+ "ld1h z3.h, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.h, %[p], %[w]\n"
+ "fadd z16.h, z16.h, z2.h\n"
+ "ld1h z17.h, p1/z, [%[inptr], #4, MUL VL]\n"
+ "fmin z13.h, p0/m, z13.h, z0.h\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "fadd z13.h, z13.h, z2.h\n"
- "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
"fadd z14.h, z14.h, z3.h\n"
- "ld1h z17.h, p1/z, [%[inptr], #4, MUL VL]\n"
+ "ld1h z4.h, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "fmin z16.h, p0/m, z16.h, z0.h\n"
+ "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
+ "fmax z13.h, p0/m, z13.h, z1.h\n"
"ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
+ "fadd z17.h, z17.h, z3.h\n"
"addvl %[inptr], %[inptr], #24\n"
- "fmin z13.h, p0/m, z13.h, z0.h\n"
"fmin z14.h, p1/m, z14.h, z0.h\n"
+ "st1h z13.h, p0, [%[outptr0]]\n"
"fadd z15.h, z15.h, z4.h\n"
- "fadd z16.h, z16.h, z2.h\n"
- "fmax z13.h, p0/m, z13.h, z1.h\n"
+ "fmax z16.h, p0/m, z16.h, z1.h\n"
+ "fmin z17.h, p1/m, z17.h, z0.h\n"
"fmax z14.h, p1/m, z14.h, z1.h\n"
"fmin z15.h, p2/m, z15.h, z0.h\n"
- "fmin z16.h, p0/m, z16.h, z0.h\n"
- "st1h z13.h, p0, [%[outptr0]]\n"
- "fadd z17.h, z17.h, z3.h\n"
"fadd z18.h, z18.h, z4.h\n"
- "fmax z15.h, p2/m, z15.h, z1.h\n"
+ "fmax z17.h, p1/m, z17.h, z1.h\n"
"st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
- "fmax z16.h, p0/m, z16.h, z1.h\n"
- "fmin z17.h, p1/m, z17.h, z0.h\n"
+ "fmax z15.h, p2/m, z15.h, z1.h\n"
"fmin z18.h, p2/m, z18.h, z0.h\n"
"st1h z15.h, p2, [%[outptr0], #2, MUL VL]\n"
"addvl %[outptr0], %[outptr0], #3\n"
- "fmax z17.h, p1/m, z17.h, z1.h\n"
- "st1h z16.h, p0, [%[outptr1]]\n"
"fmax z18.h, p2/m, z18.h, z1.h\n"
+ "st1h z16.h, p0, [%[outptr1]]\n"
"st1h z17.h, p1, [%[outptr1], #1, MUL VL]\n"
"st1h z18.h, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
@@ -1135,60 +1135,60 @@ void MergeResults<3, 8, true>(__fp16 *out, const __fp16 *in, const int ldout, co
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1h z2.h, p0/z, [%[biasptr]]\n"
"whilelt p1.h, %[p], %[w]\n"
- "ld1h z3.h, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1h z13.h, p0/z, [%[inptr]]\n"
"inch %[p], all, mul #1\n"
- "ld1h z4.h, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1h z13.h, p0/z, [%[inptr]]\n"
- "whilelt p2.h, %[p], %[w]\n"
- "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fadd z13.h, z13.h, z2.h\n"
- "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "fadd z14.h, z14.h, z3.h\n"
+ "ld1h z3.h, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.h, %[p], %[w]\n"
+ "fadd z16.h, z16.h, z2.h\n"
"ld1h z17.h, p1/z, [%[inptr], #4, MUL VL]\n"
"fmin z13.h, p0/m, z13.h, z0.h\n"
- "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
- "fadd z15.h, z15.h, z4.h\n"
"ld1h z19.h, p0/z, [%[inptr], #6, MUL VL]\n"
- "fadd z16.h, z16.h, z2.h\n"
- "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
- "fmin z14.h, p1/m, z14.h, z0.h\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fadd z14.h, z14.h, z3.h\n"
+ "ld1h z4.h, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fmax z13.h, p0/m, z13.h, z1.h\n"
- "addvl %[inptr], %[inptr], #24\n"
- "fmax z14.h, p1/m, z14.h, z1.h\n"
- "fmin z15.h, p2/m, z15.h, z0.h\n"
- "st1h z13.h, p0, [%[outptr0]]\n"
+ "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
+ "fmin z14.h, p1/m, z14.h, z0.h\n"
+ "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
+ "fadd z15.h, z15.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
"fmin z16.h, p0/m, z16.h, z0.h\n"
+ "st1h z13.h, p0, [%[outptr0]]\n"
+ "fmax z14.h, p1/m, z14.h, z1.h\n"
"ld1h z13.h, p2/z, [x8, #-8, MUL VL]\n"
- "fadd z17.h, z17.h, z3.h\n"
- "fadd z18.h, z18.h, z4.h\n"
+ "fmin z15.h, p2/m, z15.h, z0.h\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fmax z16.h, p0/m, z16.h, z1.h\n"
"st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
+ "fadd z17.h, z17.h, z3.h\n"
+ "addvl %[inptr], %[inptr], #24\n"
"fmax z15.h, p2/m, z15.h, z1.h\n"
- "fmax z16.h, p0/m, z16.h, z1.h\n"
+ "fadd z18.h, z18.h, z4.h\n"
"fmin z17.h, p1/m, z17.h, z0.h\n"
- "fmin z18.h, p2/m, z18.h, z0.h\n"
- "st1h z15.h, p2, [%[outptr0], #2, MUL VL]\n"
"fadd z19.h, z19.h, z2.h\n"
+ "st1h z15.h, p2, [%[outptr0], #2, MUL VL]\n"
+ "fadd z20.h, z20.h, z3.h\n"
"addvl %[outptr0], %[outptr0], #3\n"
"fmax z17.h, p1/m, z17.h, z1.h\n"
"st1h z16.h, p0, [%[outptr1]]\n"
- "fmax z18.h, p2/m, z18.h, z1.h\n"
+ "fmin z18.h, p2/m, z18.h, z0.h\n"
"fmin z19.h, p0/m, z19.h, z0.h\n"
- "fadd z20.h, z20.h, z3.h\n"
+ "fmin z20.h, p1/m, z20.h, z0.h\n"
"st1h z17.h, p1, [%[outptr1], #1, MUL VL]\n"
"fadd z13.h, z13.h, z4.h\n"
+ "fmax z18.h, p2/m, z18.h, z1.h\n"
"fmax z19.h, p0/m, z19.h, z1.h\n"
+ "fmax z20.h, p1/m, z20.h, z1.h\n"
+ "fmin z13.h, p2/m, z13.h, z0.h\n"
"st1h z18.h, p2, [%[outptr1], #2, MUL VL]\n"
- "fmin z20.h, p1/m, z20.h, z0.h\n"
"addvl %[outptr1], %[outptr1], #3\n"
- "fmin z13.h, p2/m, z13.h, z0.h\n"
- "st1h z19.h, p0, [%[outptr2]]\n"
- "fmax z20.h, p1/m, z20.h, z1.h\n"
"fmax z13.h, p2/m, z13.h, z1.h\n"
+ "st1h z19.h, p0, [%[outptr2]]\n"
"st1h z20.h, p1, [%[outptr2], #1, MUL VL]\n"
"st1h z13.h, p2, [%[outptr2], #2, MUL VL]\n"
"addvl %[outptr2], %[outptr2], #3\n"
@@ -1215,75 +1215,75 @@ void MergeResults<3, 8, true>(__fp16 *out, const __fp16 *in, const int ldout, co
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1h z2.h, p0/z, [%[biasptr]]\n"
"whilelt p1.h, %[p], %[w]\n"
- "ld1h z3.h, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1h z13.h, p0/z, [%[inptr]]\n"
"inch %[p], all, mul #1\n"
- "ld1h z4.h, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1h z13.h, p0/z, [%[inptr]]\n"
- "whilelt p2.h, %[p], %[w]\n"
- "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fadd z13.h, z13.h, z2.h\n"
- "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "fadd z14.h, z14.h, z3.h\n"
+ "ld1h z3.h, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.h, %[p], %[w]\n"
+ "fadd z16.h, z16.h, z2.h\n"
"ld1h z17.h, p1/z, [%[inptr], #4, MUL VL]\n"
"fmin z13.h, p0/m, z13.h, z0.h\n"
- "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
- "fadd z15.h, z15.h, z4.h\n"
"ld1h z19.h, p0/z, [%[inptr], #6, MUL VL]\n"
- "fadd z16.h, z16.h, z2.h\n"
- "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
- "fmin z14.h, p1/m, z14.h, z0.h\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fadd z14.h, z14.h, z3.h\n"
+ "ld1h z4.h, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fmax z13.h, p0/m, z13.h, z1.h\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
- "fmax z14.h, p1/m, z14.h, z1.h\n"
- "addvl %[inptr], %[inptr], #24\n"
- "fmin z15.h, p2/m, z15.h, z0.h\n"
- "st1h z13.h, p0, [%[outptr0]]\n"
+ "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
+ "fmin z14.h, p1/m, z14.h, z0.h\n"
+ "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
+ "fadd z15.h, z15.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
"fmin z16.h, p0/m, z16.h, z0.h\n"
+ "st1h z13.h, p0, [%[outptr0]]\n"
+ "fmax z14.h, p1/m, z14.h, z1.h\n"
"ld1h z13.h, p2/z, [x8, #-8, MUL VL]\n"
- "fadd z17.h, z17.h, z3.h\n"
- "fadd z18.h, z18.h, z4.h\n"
+ "fmin z15.h, p2/m, z15.h, z0.h\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fmax z16.h, p0/m, z16.h, z1.h\n"
"st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
- "fmax z15.h, p2/m, z15.h, z1.h\n"
+ "fadd z17.h, z17.h, z3.h\n"
"ld1h z14.h, p0/z, [x8, #-7, MUL VL]\n"
- "fmax z16.h, p0/m, z16.h, z1.h\n"
+ "fmax z15.h, p2/m, z15.h, z1.h\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "fadd z18.h, z18.h, z4.h\n"
+ "addvl %[inptr], %[inptr], #24\n"
"fmin z17.h, p1/m, z17.h, z0.h\n"
- "fmin z18.h, p2/m, z18.h, z0.h\n"
"st1h z15.h, p2, [%[outptr0], #2, MUL VL]\n"
"fadd z19.h, z19.h, z2.h\n"
"ld1h z15.h, p1/z, [x8, #-6, MUL VL]\n"
- "fadd z20.h, z20.h, z3.h\n"
+ "fmin z18.h, p2/m, z18.h, z0.h\n"
"addvl %[outptr0], %[outptr0], #3\n"
"fmax z17.h, p1/m, z17.h, z1.h\n"
"st1h z16.h, p0, [%[outptr1]]\n"
- "fmax z18.h, p2/m, z18.h, z1.h\n"
- "ld1h z16.h, p2/z, [x8, #-5, MUL VL]\n"
"fmin z19.h, p0/m, z19.h, z0.h\n"
- "fmin z20.h, p1/m, z20.h, z0.h\n"
+ "ld1h z16.h, p2/z, [x8, #-5, MUL VL]\n"
+ "fmax z18.h, p2/m, z18.h, z1.h\n"
+ "fadd z20.h, z20.h, z3.h\n"
"st1h z17.h, p1, [%[outptr1], #1, MUL VL]\n"
"fadd z13.h, z13.h, z4.h\n"
- "fadd z14.h, z14.h, z2.h\n"
"fmax z19.h, p0/m, z19.h, z1.h\n"
+ "fadd z14.h, z14.h, z2.h\n"
"st1h z18.h, p2, [%[outptr1], #2, MUL VL]\n"
- "fmax z20.h, p1/m, z20.h, z1.h\n"
+ "fmin z20.h, p1/m, z20.h, z0.h\n"
"addvl %[outptr1], %[outptr1], #3\n"
"fmin z13.h, p2/m, z13.h, z0.h\n"
"st1h z19.h, p0, [%[outptr2]]\n"
"fmin z14.h, p0/m, z14.h, z0.h\n"
+ "fmax z20.h, p1/m, z20.h, z1.h\n"
"fadd z15.h, z15.h, z3.h\n"
- "fadd z16.h, z16.h, z4.h\n"
- "st1h z20.h, p1, [%[outptr2], #1, MUL VL]\n"
"fmax z13.h, p2/m, z13.h, z1.h\n"
"fmax z14.h, p0/m, z14.h, z1.h\n"
+ "st1h z20.h, p1, [%[outptr2], #1, MUL VL]\n"
+ "fadd z16.h, z16.h, z4.h\n"
"fmin z15.h, p1/m, z15.h, z0.h\n"
- "fmin z16.h, p2/m, z16.h, z0.h\n"
"st1h z13.h, p2, [%[outptr2], #2, MUL VL]\n"
"addvl %[outptr2], %[outptr2], #3\n"
"fmax z15.h, p1/m, z15.h, z1.h\n"
+ "fmin z16.h, p2/m, z16.h, z0.h\n"
"st1h z14.h, p0, [%[outptr3]]\n"
"fmax z16.h, p2/m, z16.h, z1.h\n"
"st1h z15.h, p1, [%[outptr3], #1, MUL VL]\n"
@@ -1312,93 +1312,93 @@ void MergeResults<3, 8, true>(__fp16 *out, const __fp16 *in, const int ldout, co
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1h z2.h, p0/z, [%[biasptr]]\n"
"whilelt p1.h, %[p], %[w]\n"
- "ld1h z3.h, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1h z13.h, p0/z, [%[inptr]]\n"
"inch %[p], all, mul #1\n"
- "ld1h z4.h, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1h z13.h, p0/z, [%[inptr]]\n"
- "whilelt p2.h, %[p], %[w]\n"
- "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fadd z13.h, z13.h, z2.h\n"
- "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "fadd z14.h, z14.h, z3.h\n"
+ "ld1h z3.h, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.h, %[p], %[w]\n"
+ "fadd z16.h, z16.h, z2.h\n"
"ld1h z17.h, p1/z, [%[inptr], #4, MUL VL]\n"
"fmin z13.h, p0/m, z13.h, z0.h\n"
- "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
- "fadd z15.h, z15.h, z4.h\n"
"ld1h z19.h, p0/z, [%[inptr], #6, MUL VL]\n"
- "fadd z16.h, z16.h, z2.h\n"
- "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
- "fmin z14.h, p1/m, z14.h, z0.h\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fadd z14.h, z14.h, z3.h\n"
+ "ld1h z4.h, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fmax z13.h, p0/m, z13.h, z1.h\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
- "fmax z14.h, p1/m, z14.h, z1.h\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "fmin z15.h, p2/m, z15.h, z0.h\n"
- "st1h z13.h, p0, [%[outptr0]]\n"
+ "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
+ "fmin z14.h, p1/m, z14.h, z0.h\n"
+ "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
+ "fadd z15.h, z15.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
"fmin z16.h, p0/m, z16.h, z0.h\n"
+ "st1h z13.h, p0, [%[outptr0]]\n"
+ "fmax z14.h, p1/m, z14.h, z1.h\n"
"ld1h z13.h, p2/z, [x8, #-8, MUL VL]\n"
- "fadd z17.h, z17.h, z3.h\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
- "fmax z15.h, p2/m, z15.h, z1.h\n"
- "st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
+ "fmin z15.h, p2/m, z15.h, z0.h\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"fmax z16.h, p0/m, z16.h, z1.h\n"
+ "st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
+ "fadd z17.h, z17.h, z3.h\n"
"ld1h z14.h, p0/z, [x8, #-7, MUL VL]\n"
- "fmin z17.h, p1/m, z17.h, z0.h\n"
- "addvl %[inptr], %[inptr], #24\n"
+ "fmax z15.h, p2/m, z15.h, z1.h\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"fadd z18.h, z18.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmin z17.h, p1/m, z17.h, z0.h\n"
"st1h z15.h, p2, [%[outptr0], #2, MUL VL]\n"
"fadd z19.h, z19.h, z2.h\n"
"ld1h z15.h, p1/z, [x8, #-6, MUL VL]\n"
- "fmax z17.h, p1/m, z17.h, z1.h\n"
- "addvl %[outptr0], %[outptr0], #3\n"
"fmin z18.h, p2/m, z18.h, z0.h\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmax z17.h, p1/m, z17.h, z1.h\n"
"st1h z16.h, p0, [%[outptr1]]\n"
"fmin z19.h, p0/m, z19.h, z0.h\n"
"ld1h z16.h, p2/z, [x8, #-5, MUL VL]\n"
+ "fmax z18.h, p2/m, z18.h, z1.h\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
"fadd z20.h, z20.h, z3.h\n"
- "fadd z13.h, z13.h, z4.h\n"
"st1h z17.h, p1, [%[outptr1], #1, MUL VL]\n"
- "fmax z18.h, p2/m, z18.h, z1.h\n"
- "ld1h z17.h, p0/z, [x8, #-4, MUL VL]\n"
"fmax z19.h, p0/m, z19.h, z1.h\n"
+ "ld1h z17.h, p0/z, [x8, #-4, MUL VL]\n"
+ "fadd z13.h, z13.h, z4.h\n"
+ "addvl %[inptr], %[inptr], #24\n"
"fmin z20.h, p1/m, z20.h, z0.h\n"
- "fmin z13.h, p2/m, z13.h, z0.h\n"
"st1h z18.h, p2, [%[outptr1], #2, MUL VL]\n"
"fadd z14.h, z14.h, z2.h\n"
"ld1h z18.h, p1/z, [x8, #-3, MUL VL]\n"
- "fadd z15.h, z15.h, z3.h\n"
+ "fmin z13.h, p2/m, z13.h, z0.h\n"
"addvl %[outptr1], %[outptr1], #3\n"
"fmax z20.h, p1/m, z20.h, z1.h\n"
"st1h z19.h, p0, [%[outptr2]]\n"
- "fmax z13.h, p2/m, z13.h, z1.h\n"
- "ld1h z19.h, p2/z, [x8, #-2, MUL VL]\n"
"fmin z14.h, p0/m, z14.h, z0.h\n"
- "fmin z15.h, p1/m, z15.h, z0.h\n"
+ "ld1h z19.h, p2/z, [x8, #-2, MUL VL]\n"
+ "fmax z13.h, p2/m, z13.h, z1.h\n"
+ "fadd z15.h, z15.h, z3.h\n"
"st1h z20.h, p1, [%[outptr2], #1, MUL VL]\n"
"fadd z16.h, z16.h, z4.h\n"
- "fadd z17.h, z17.h, z2.h\n"
"fmax z14.h, p0/m, z14.h, z1.h\n"
+ "fadd z17.h, z17.h, z2.h\n"
"st1h z13.h, p2, [%[outptr2], #2, MUL VL]\n"
- "fmax z15.h, p1/m, z15.h, z1.h\n"
+ "fmin z15.h, p1/m, z15.h, z0.h\n"
"addvl %[outptr2], %[outptr2], #3\n"
"fmin z16.h, p2/m, z16.h, z0.h\n"
"st1h z14.h, p0, [%[outptr3]]\n"
"fmin z17.h, p0/m, z17.h, z0.h\n"
+ "fmax z15.h, p1/m, z15.h, z1.h\n"
"fadd z18.h, z18.h, z3.h\n"
- "fadd z19.h, z19.h, z4.h\n"
- "st1h z15.h, p1, [%[outptr3], #1, MUL VL]\n"
"fmax z16.h, p2/m, z16.h, z1.h\n"
"fmax z17.h, p0/m, z17.h, z1.h\n"
+ "st1h z15.h, p1, [%[outptr3], #1, MUL VL]\n"
+ "fadd z19.h, z19.h, z4.h\n"
"fmin z18.h, p1/m, z18.h, z0.h\n"
- "fmin z19.h, p2/m, z19.h, z0.h\n"
"st1h z16.h, p2, [%[outptr3], #2, MUL VL]\n"
"addvl %[outptr3], %[outptr3], #3\n"
"fmax z18.h, p1/m, z18.h, z1.h\n"
+ "fmin z19.h, p2/m, z19.h, z0.h\n"
"st1h z17.h, p0, [%[outptr4]]\n"
"fmax z19.h, p2/m, z19.h, z1.h\n"
"st1h z18.h, p1, [%[outptr4], #1, MUL VL]\n"
@@ -1427,111 +1427,111 @@ void MergeResults<3, 8, true>(__fp16 *out, const __fp16 *in, const int ldout, co
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1h z2.h, p0/z, [%[biasptr]]\n"
"whilelt p1.h, %[p], %[w]\n"
- "ld1h z3.h, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1h z13.h, p0/z, [%[inptr]]\n"
"inch %[p], all, mul #1\n"
- "ld1h z4.h, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1h z13.h, p0/z, [%[inptr]]\n"
- "whilelt p2.h, %[p], %[w]\n"
- "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fadd z13.h, z13.h, z2.h\n"
- "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "fadd z14.h, z14.h, z3.h\n"
+ "ld1h z3.h, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.h, %[p], %[w]\n"
+ "fadd z16.h, z16.h, z2.h\n"
"ld1h z17.h, p1/z, [%[inptr], #4, MUL VL]\n"
"fmin z13.h, p0/m, z13.h, z0.h\n"
- "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
- "fadd z15.h, z15.h, z4.h\n"
"ld1h z19.h, p0/z, [%[inptr], #6, MUL VL]\n"
- "fadd z16.h, z16.h, z2.h\n"
- "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
- "fmin z14.h, p1/m, z14.h, z0.h\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fadd z14.h, z14.h, z3.h\n"
+ "ld1h z4.h, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fmax z13.h, p0/m, z13.h, z1.h\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
- "fmax z14.h, p1/m, z14.h, z1.h\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "fmin z15.h, p2/m, z15.h, z0.h\n"
- "st1h z13.h, p0, [%[outptr0]]\n"
+ "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
+ "fmin z14.h, p1/m, z14.h, z0.h\n"
+ "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
+ "fadd z15.h, z15.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
"fmin z16.h, p0/m, z16.h, z0.h\n"
+ "st1h z13.h, p0, [%[outptr0]]\n"
+ "fmax z14.h, p1/m, z14.h, z1.h\n"
"ld1h z13.h, p2/z, [x8, #-8, MUL VL]\n"
- "fadd z17.h, z17.h, z3.h\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
- "fmax z15.h, p2/m, z15.h, z1.h\n"
- "st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
+ "fmin z15.h, p2/m, z15.h, z0.h\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"fmax z16.h, p0/m, z16.h, z1.h\n"
+ "st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
+ "fadd z17.h, z17.h, z3.h\n"
"ld1h z14.h, p0/z, [x8, #-7, MUL VL]\n"
- "fmin z17.h, p1/m, z17.h, z0.h\n"
- "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmax z15.h, p2/m, z15.h, z1.h\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"fadd z18.h, z18.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmin z17.h, p1/m, z17.h, z0.h\n"
"st1h z15.h, p2, [%[outptr0], #2, MUL VL]\n"
"fadd z19.h, z19.h, z2.h\n"
"ld1h z15.h, p1/z, [x8, #-6, MUL VL]\n"
- "fmax z17.h, p1/m, z17.h, z1.h\n"
- "addvl %[outptr0], %[outptr0], #3\n"
"fmin z18.h, p2/m, z18.h, z0.h\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmax z17.h, p1/m, z17.h, z1.h\n"
"st1h z16.h, p0, [%[outptr1]]\n"
"fmin z19.h, p0/m, z19.h, z0.h\n"
"ld1h z16.h, p2/z, [x8, #-5, MUL VL]\n"
- "fadd z20.h, z20.h, z3.h\n"
- "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"fmax z18.h, p2/m, z18.h, z1.h\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "fadd z20.h, z20.h, z3.h\n"
"st1h z17.h, p1, [%[outptr1], #1, MUL VL]\n"
"fmax z19.h, p0/m, z19.h, z1.h\n"
"ld1h z17.h, p0/z, [x8, #-4, MUL VL]\n"
- "fmin z20.h, p1/m, z20.h, z0.h\n"
- "addvl %[inptr], %[inptr], #24\n"
"fadd z13.h, z13.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmin z20.h, p1/m, z20.h, z0.h\n"
"st1h z18.h, p2, [%[outptr1], #2, MUL VL]\n"
"fadd z14.h, z14.h, z2.h\n"
"ld1h z18.h, p1/z, [x8, #-3, MUL VL]\n"
- "fmax z20.h, p1/m, z20.h, z1.h\n"
- "addvl %[outptr1], %[outptr1], #3\n"
"fmin z13.h, p2/m, z13.h, z0.h\n"
+ "addvl %[outptr1], %[outptr1], #3\n"
+ "fmax z20.h, p1/m, z20.h, z1.h\n"
"st1h z19.h, p0, [%[outptr2]]\n"
"fmin z14.h, p0/m, z14.h, z0.h\n"
"ld1h z19.h, p2/z, [x8, #-2, MUL VL]\n"
+ "fmax z13.h, p2/m, z13.h, z1.h\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"fadd z15.h, z15.h, z3.h\n"
- "fadd z16.h, z16.h, z4.h\n"
"st1h z20.h, p1, [%[outptr2], #1, MUL VL]\n"
- "fmax z13.h, p2/m, z13.h, z1.h\n"
- "ld1h z20.h, p0/z, [x8, #-1, MUL VL]\n"
"fmax z14.h, p0/m, z14.h, z1.h\n"
+ "ld1h z20.h, p0/z, [x8, #-1, MUL VL]\n"
+ "fadd z16.h, z16.h, z4.h\n"
+ "addvl %[inptr], %[inptr], #24\n"
"fmin z15.h, p1/m, z15.h, z0.h\n"
- "fmin z16.h, p2/m, z16.h, z0.h\n"
"st1h z13.h, p2, [%[outptr2], #2, MUL VL]\n"
"fadd z17.h, z17.h, z2.h\n"
"ld1h z13.h, p1/z, [x8]\n"
- "fadd z18.h, z18.h, z3.h\n"
+ "fmin z16.h, p2/m, z16.h, z0.h\n"
"addvl %[outptr2], %[outptr2], #3\n"
"fmax z15.h, p1/m, z15.h, z1.h\n"
"st1h z14.h, p0, [%[outptr3]]\n"
- "fmax z16.h, p2/m, z16.h, z1.h\n"
- "ld1h z14.h, p2/z, [x8, #1, MUL VL]\n"
"fmin z17.h, p0/m, z17.h, z0.h\n"
- "fmin z18.h, p1/m, z18.h, z0.h\n"
+ "ld1h z14.h, p2/z, [x8, #1, MUL VL]\n"
+ "fmax z16.h, p2/m, z16.h, z1.h\n"
+ "fadd z18.h, z18.h, z3.h\n"
"st1h z15.h, p1, [%[outptr3], #1, MUL VL]\n"
"fadd z19.h, z19.h, z4.h\n"
- "fadd z20.h, z20.h, z2.h\n"
"fmax z17.h, p0/m, z17.h, z1.h\n"
+ "fadd z20.h, z20.h, z2.h\n"
"st1h z16.h, p2, [%[outptr3], #2, MUL VL]\n"
- "fmax z18.h, p1/m, z18.h, z1.h\n"
+ "fmin z18.h, p1/m, z18.h, z0.h\n"
"addvl %[outptr3], %[outptr3], #3\n"
"fmin z19.h, p2/m, z19.h, z0.h\n"
"st1h z17.h, p0, [%[outptr4]]\n"
"fmin z20.h, p0/m, z20.h, z0.h\n"
+ "fmax z18.h, p1/m, z18.h, z1.h\n"
"fadd z13.h, z13.h, z3.h\n"
- "fadd z14.h, z14.h, z4.h\n"
- "st1h z18.h, p1, [%[outptr4], #1, MUL VL]\n"
"fmax z19.h, p2/m, z19.h, z1.h\n"
"fmax z20.h, p0/m, z20.h, z1.h\n"
+ "st1h z18.h, p1, [%[outptr4], #1, MUL VL]\n"
+ "fadd z14.h, z14.h, z4.h\n"
"fmin z13.h, p1/m, z13.h, z0.h\n"
- "fmin z14.h, p2/m, z14.h, z0.h\n"
"st1h z19.h, p2, [%[outptr4], #2, MUL VL]\n"
"addvl %[outptr4], %[outptr4], #3\n"
"fmax z13.h, p1/m, z13.h, z1.h\n"
+ "fmin z14.h, p2/m, z14.h, z0.h\n"
"st1h z20.h, p0, [%[outptr5]]\n"
"fmax z14.h, p2/m, z14.h, z1.h\n"
"st1h z13.h, p1, [%[outptr5], #1, MUL VL]\n"
@@ -1560,129 +1560,129 @@ void MergeResults<3, 8, true>(__fp16 *out, const __fp16 *in, const int ldout, co
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1h z2.h, p0/z, [%[biasptr]]\n"
"whilelt p1.h, %[p], %[w]\n"
- "ld1h z3.h, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1h z13.h, p0/z, [%[inptr]]\n"
"inch %[p], all, mul #1\n"
- "ld1h z4.h, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1h z13.h, p0/z, [%[inptr]]\n"
- "whilelt p2.h, %[p], %[w]\n"
- "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fadd z13.h, z13.h, z2.h\n"
- "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "fadd z14.h, z14.h, z3.h\n"
+ "ld1h z3.h, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.h, %[p], %[w]\n"
+ "fadd z16.h, z16.h, z2.h\n"
"ld1h z17.h, p1/z, [%[inptr], #4, MUL VL]\n"
"fmin z13.h, p0/m, z13.h, z0.h\n"
- "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
- "fadd z15.h, z15.h, z4.h\n"
"ld1h z19.h, p0/z, [%[inptr], #6, MUL VL]\n"
- "fadd z16.h, z16.h, z2.h\n"
- "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
- "fmin z14.h, p1/m, z14.h, z0.h\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fadd z14.h, z14.h, z3.h\n"
+ "ld1h z4.h, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fmax z13.h, p0/m, z13.h, z1.h\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
- "fmax z14.h, p1/m, z14.h, z1.h\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "fmin z15.h, p2/m, z15.h, z0.h\n"
- "st1h z13.h, p0, [%[outptr0]]\n"
+ "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
+ "fmin z14.h, p1/m, z14.h, z0.h\n"
+ "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
+ "fadd z15.h, z15.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
"fmin z16.h, p0/m, z16.h, z0.h\n"
+ "st1h z13.h, p0, [%[outptr0]]\n"
+ "fmax z14.h, p1/m, z14.h, z1.h\n"
"ld1h z13.h, p2/z, [x8, #-8, MUL VL]\n"
- "fadd z17.h, z17.h, z3.h\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
- "fmax z15.h, p2/m, z15.h, z1.h\n"
- "st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
+ "fmin z15.h, p2/m, z15.h, z0.h\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"fmax z16.h, p0/m, z16.h, z1.h\n"
+ "st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
+ "fadd z17.h, z17.h, z3.h\n"
"ld1h z14.h, p0/z, [x8, #-7, MUL VL]\n"
- "fmin z17.h, p1/m, z17.h, z0.h\n"
- "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmax z15.h, p2/m, z15.h, z1.h\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"fadd z18.h, z18.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmin z17.h, p1/m, z17.h, z0.h\n"
"st1h z15.h, p2, [%[outptr0], #2, MUL VL]\n"
"fadd z19.h, z19.h, z2.h\n"
"ld1h z15.h, p1/z, [x8, #-6, MUL VL]\n"
- "fmax z17.h, p1/m, z17.h, z1.h\n"
- "addvl %[outptr0], %[outptr0], #3\n"
"fmin z18.h, p2/m, z18.h, z0.h\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmax z17.h, p1/m, z17.h, z1.h\n"
"st1h z16.h, p0, [%[outptr1]]\n"
"fmin z19.h, p0/m, z19.h, z0.h\n"
"ld1h z16.h, p2/z, [x8, #-5, MUL VL]\n"
- "fadd z20.h, z20.h, z3.h\n"
- "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"fmax z18.h, p2/m, z18.h, z1.h\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "fadd z20.h, z20.h, z3.h\n"
"st1h z17.h, p1, [%[outptr1], #1, MUL VL]\n"
"fmax z19.h, p0/m, z19.h, z1.h\n"
"ld1h z17.h, p0/z, [x8, #-4, MUL VL]\n"
- "fmin z20.h, p1/m, z20.h, z0.h\n"
- "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
"fadd z13.h, z13.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmin z20.h, p1/m, z20.h, z0.h\n"
"st1h z18.h, p2, [%[outptr1], #2, MUL VL]\n"
"fadd z14.h, z14.h, z2.h\n"
"ld1h z18.h, p1/z, [x8, #-3, MUL VL]\n"
- "fmax z20.h, p1/m, z20.h, z1.h\n"
- "addvl %[outptr1], %[outptr1], #3\n"
"fmin z13.h, p2/m, z13.h, z0.h\n"
+ "addvl %[outptr1], %[outptr1], #3\n"
+ "fmax z20.h, p1/m, z20.h, z1.h\n"
"st1h z19.h, p0, [%[outptr2]]\n"
"fmin z14.h, p0/m, z14.h, z0.h\n"
"ld1h z19.h, p2/z, [x8, #-2, MUL VL]\n"
- "fadd z15.h, z15.h, z3.h\n"
- "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
"fmax z13.h, p2/m, z13.h, z1.h\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "fadd z15.h, z15.h, z3.h\n"
"st1h z20.h, p1, [%[outptr2], #1, MUL VL]\n"
"fmax z14.h, p0/m, z14.h, z1.h\n"
"ld1h z20.h, p0/z, [x8, #-1, MUL VL]\n"
- "fmin z15.h, p1/m, z15.h, z0.h\n"
- "addvl %[inptr], %[inptr], #24\n"
"fadd z16.h, z16.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "fmin z15.h, p1/m, z15.h, z0.h\n"
"st1h z13.h, p2, [%[outptr2], #2, MUL VL]\n"
"fadd z17.h, z17.h, z2.h\n"
"ld1h z13.h, p1/z, [x8]\n"
- "fmax z15.h, p1/m, z15.h, z1.h\n"
- "addvl %[outptr2], %[outptr2], #3\n"
"fmin z16.h, p2/m, z16.h, z0.h\n"
+ "addvl %[outptr2], %[outptr2], #3\n"
+ "fmax z15.h, p1/m, z15.h, z1.h\n"
"st1h z14.h, p0, [%[outptr3]]\n"
"fmin z17.h, p0/m, z17.h, z0.h\n"
"ld1h z14.h, p2/z, [x8, #1, MUL VL]\n"
+ "fmax z16.h, p2/m, z16.h, z1.h\n"
+ "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
"fadd z18.h, z18.h, z3.h\n"
- "fadd z19.h, z19.h, z4.h\n"
"st1h z15.h, p1, [%[outptr3], #1, MUL VL]\n"
- "fmax z16.h, p2/m, z16.h, z1.h\n"
- "ld1h z15.h, p0/z, [x8, #2, MUL VL]\n"
"fmax z17.h, p0/m, z17.h, z1.h\n"
+ "ld1h z15.h, p0/z, [x8, #2, MUL VL]\n"
+ "fadd z19.h, z19.h, z4.h\n"
+ "addvl %[inptr], %[inptr], #24\n"
"fmin z18.h, p1/m, z18.h, z0.h\n"
- "fmin z19.h, p2/m, z19.h, z0.h\n"
"st1h z16.h, p2, [%[outptr3], #2, MUL VL]\n"
"fadd z20.h, z20.h, z2.h\n"
"ld1h z16.h, p1/z, [x8, #3, MUL VL]\n"
- "fadd z13.h, z13.h, z3.h\n"
+ "fmin z19.h, p2/m, z19.h, z0.h\n"
"addvl %[outptr3], %[outptr3], #3\n"
"fmax z18.h, p1/m, z18.h, z1.h\n"
"st1h z17.h, p0, [%[outptr4]]\n"
- "fmax z19.h, p2/m, z19.h, z1.h\n"
- "ld1h z17.h, p2/z, [x8, #4, MUL VL]\n"
"fmin z20.h, p0/m, z20.h, z0.h\n"
- "fmin z13.h, p1/m, z13.h, z0.h\n"
+ "ld1h z17.h, p2/z, [x8, #4, MUL VL]\n"
+ "fmax z19.h, p2/m, z19.h, z1.h\n"
+ "fadd z13.h, z13.h, z3.h\n"
"st1h z18.h, p1, [%[outptr4], #1, MUL VL]\n"
"fadd z14.h, z14.h, z4.h\n"
- "fadd z15.h, z15.h, z2.h\n"
"fmax z20.h, p0/m, z20.h, z1.h\n"
+ "fadd z15.h, z15.h, z2.h\n"
"st1h z19.h, p2, [%[outptr4], #2, MUL VL]\n"
- "fmax z13.h, p1/m, z13.h, z1.h\n"
+ "fmin z13.h, p1/m, z13.h, z0.h\n"
"addvl %[outptr4], %[outptr4], #3\n"
"fmin z14.h, p2/m, z14.h, z0.h\n"
"st1h z20.h, p0, [%[outptr5]]\n"
"fmin z15.h, p0/m, z15.h, z0.h\n"
+ "fmax z13.h, p1/m, z13.h, z1.h\n"
"fadd z16.h, z16.h, z3.h\n"
- "fadd z17.h, z17.h, z4.h\n"
- "st1h z13.h, p1, [%[outptr5], #1, MUL VL]\n"
"fmax z14.h, p2/m, z14.h, z1.h\n"
"fmax z15.h, p0/m, z15.h, z1.h\n"
+ "st1h z13.h, p1, [%[outptr5], #1, MUL VL]\n"
+ "fadd z17.h, z17.h, z4.h\n"
"fmin z16.h, p1/m, z16.h, z0.h\n"
- "fmin z17.h, p2/m, z17.h, z0.h\n"
"st1h z14.h, p2, [%[outptr5], #2, MUL VL]\n"
"addvl %[outptr5], %[outptr5], #3\n"
"fmax z16.h, p1/m, z16.h, z1.h\n"
+ "fmin z17.h, p2/m, z17.h, z0.h\n"
"st1h z15.h, p0, [%[outptr6]]\n"
"fmax z17.h, p2/m, z17.h, z1.h\n"
"st1h z16.h, p1, [%[outptr6], #1, MUL VL]\n"
@@ -1712,142 +1712,142 @@ void MergeResults<3, 8, true>(__fp16 *out, const __fp16 *in, const int ldout, co
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1h z2.h, p0/z, [%[biasptr]]\n"
"whilelt p1.h, %[p], %[w]\n"
- "ld1h z3.h, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1h z13.h, p0/z, [%[inptr]]\n"
"inch %[p], all, mul #1\n"
- "ld1h z4.h, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1h z13.h, p0/z, [%[inptr]]\n"
- "whilelt p2.h, %[p], %[w]\n"
- "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fadd z13.h, z13.h, z2.h\n"
- "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "fadd z14.h, z14.h, z3.h\n"
+ "ld1h z3.h, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.h, %[p], %[w]\n"
+ "fadd z16.h, z16.h, z2.h\n"
"ld1h z17.h, p1/z, [%[inptr], #4, MUL VL]\n"
"fmin z13.h, p0/m, z13.h, z0.h\n"
- "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
- "fadd z15.h, z15.h, z4.h\n"
"ld1h z19.h, p0/z, [%[inptr], #6, MUL VL]\n"
- "fadd z16.h, z16.h, z2.h\n"
- "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
- "fmin z14.h, p1/m, z14.h, z0.h\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fadd z14.h, z14.h, z3.h\n"
+ "ld1h z4.h, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fmax z13.h, p0/m, z13.h, z1.h\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
- "fmax z14.h, p1/m, z14.h, z1.h\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "fmin z15.h, p2/m, z15.h, z0.h\n"
- "st1h z13.h, p0, [%[outptr0]]\n"
+ "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
+ "fmin z14.h, p1/m, z14.h, z0.h\n"
+ "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
+ "fadd z15.h, z15.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
"fmin z16.h, p0/m, z16.h, z0.h\n"
+ "st1h z13.h, p0, [%[outptr0]]\n"
+ "fmax z14.h, p1/m, z14.h, z1.h\n"
"ld1h z13.h, p2/z, [x8, #-8, MUL VL]\n"
- "fadd z17.h, z17.h, z3.h\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
- "fmax z15.h, p2/m, z15.h, z1.h\n"
- "st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
+ "fmin z15.h, p2/m, z15.h, z0.h\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"fmax z16.h, p0/m, z16.h, z1.h\n"
+ "st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
+ "fadd z17.h, z17.h, z3.h\n"
"ld1h z14.h, p0/z, [x8, #-7, MUL VL]\n"
- "fmin z17.h, p1/m, z17.h, z0.h\n"
- "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmax z15.h, p2/m, z15.h, z1.h\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"fadd z18.h, z18.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmin z17.h, p1/m, z17.h, z0.h\n"
"st1h z15.h, p2, [%[outptr0], #2, MUL VL]\n"
"fadd z19.h, z19.h, z2.h\n"
"ld1h z15.h, p1/z, [x8, #-6, MUL VL]\n"
- "fmax z17.h, p1/m, z17.h, z1.h\n"
- "addvl %[outptr0], %[outptr0], #3\n"
"fmin z18.h, p2/m, z18.h, z0.h\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmax z17.h, p1/m, z17.h, z1.h\n"
"st1h z16.h, p0, [%[outptr1]]\n"
"fmin z19.h, p0/m, z19.h, z0.h\n"
"ld1h z16.h, p2/z, [x8, #-5, MUL VL]\n"
- "fadd z20.h, z20.h, z3.h\n"
- "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"fmax z18.h, p2/m, z18.h, z1.h\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "fadd z20.h, z20.h, z3.h\n"
"st1h z17.h, p1, [%[outptr1], #1, MUL VL]\n"
"fmax z19.h, p0/m, z19.h, z1.h\n"
"ld1h z17.h, p0/z, [x8, #-4, MUL VL]\n"
- "fmin z20.h, p1/m, z20.h, z0.h\n"
- "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
"fadd z13.h, z13.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmin z20.h, p1/m, z20.h, z0.h\n"
"st1h z18.h, p2, [%[outptr1], #2, MUL VL]\n"
"fadd z14.h, z14.h, z2.h\n"
"ld1h z18.h, p1/z, [x8, #-3, MUL VL]\n"
- "fmax z20.h, p1/m, z20.h, z1.h\n"
- "addvl %[outptr1], %[outptr1], #3\n"
"fmin z13.h, p2/m, z13.h, z0.h\n"
+ "addvl %[outptr1], %[outptr1], #3\n"
+ "fmax z20.h, p1/m, z20.h, z1.h\n"
"st1h z19.h, p0, [%[outptr2]]\n"
"fmin z14.h, p0/m, z14.h, z0.h\n"
"ld1h z19.h, p2/z, [x8, #-2, MUL VL]\n"
- "fadd z15.h, z15.h, z3.h\n"
- "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
"fmax z13.h, p2/m, z13.h, z1.h\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "fadd z15.h, z15.h, z3.h\n"
"st1h z20.h, p1, [%[outptr2], #1, MUL VL]\n"
"fmax z14.h, p0/m, z14.h, z1.h\n"
"ld1h z20.h, p0/z, [x8, #-1, MUL VL]\n"
- "fmin z15.h, p1/m, z15.h, z0.h\n"
- "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
"fadd z16.h, z16.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "fmin z15.h, p1/m, z15.h, z0.h\n"
"st1h z13.h, p2, [%[outptr2], #2, MUL VL]\n"
"fadd z17.h, z17.h, z2.h\n"
"ld1h z13.h, p1/z, [x8]\n"
- "fmax z15.h, p1/m, z15.h, z1.h\n"
- "addvl %[outptr2], %[outptr2], #3\n"
"fmin z16.h, p2/m, z16.h, z0.h\n"
+ "addvl %[outptr2], %[outptr2], #3\n"
+ "fmax z15.h, p1/m, z15.h, z1.h\n"
"st1h z14.h, p0, [%[outptr3]]\n"
"fmin z17.h, p0/m, z17.h, z0.h\n"
"ld1h z14.h, p2/z, [x8, #1, MUL VL]\n"
- "fadd z18.h, z18.h, z3.h\n"
- "addvl %[inptr], %[inptr], #24\n"
"fmax z16.h, p2/m, z16.h, z1.h\n"
+ "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
+ "fadd z18.h, z18.h, z3.h\n"
"st1h z15.h, p1, [%[outptr3], #1, MUL VL]\n"
"fmax z17.h, p0/m, z17.h, z1.h\n"
"ld1h z15.h, p0/z, [x8, #2, MUL VL]\n"
- "fmin z18.h, p1/m, z18.h, z0.h\n"
"fadd z19.h, z19.h, z4.h\n"
+ "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
+ "fmin z18.h, p1/m, z18.h, z0.h\n"
"st1h z16.h, p2, [%[outptr3], #2, MUL VL]\n"
"fadd z20.h, z20.h, z2.h\n"
"ld1h z16.h, p1/z, [x8, #3, MUL VL]\n"
- "fadd z13.h, z13.h, z3.h\n"
+ "fmin z19.h, p2/m, z19.h, z0.h\n"
"addvl %[outptr3], %[outptr3], #3\n"
"fmax z18.h, p1/m, z18.h, z1.h\n"
"st1h z17.h, p0, [%[outptr4]]\n"
- "fmin z19.h, p2/m, z19.h, z0.h\n"
- "ld1h z17.h, p2/z, [x8, #4, MUL VL]\n"
"fmin z20.h, p0/m, z20.h, z0.h\n"
- "fmin z13.h, p1/m, z13.h, z0.h\n"
- "st1h z18.h, p1, [%[outptr4], #1, MUL VL]\n"
- "fadd z14.h, z14.h, z4.h\n"
- "ld1h z18.h, p0/z, [x8, #5, MUL VL]\n"
+ "ld1h z17.h, p2/z, [x8, #4, MUL VL]\n"
"fmax z19.h, p2/m, z19.h, z1.h\n"
+ "addvl %[inptr], %[inptr], #24\n"
+ "fadd z13.h, z13.h, z3.h\n"
+ "st1h z18.h, p1, [%[outptr4], #1, MUL VL]\n"
"fmax z20.h, p0/m, z20.h, z1.h\n"
- "fmax z13.h, p1/m, z13.h, z1.h\n"
- "fmin z14.h, p2/m, z14.h, z0.h\n"
- "st1h z19.h, p2, [%[outptr4], #2, MUL VL]\n"
+ "ld1h z18.h, p0/z, [x8, #5, MUL VL]\n"
+ "fadd z14.h, z14.h, z4.h\n"
"fadd z15.h, z15.h, z2.h\n"
+ "st1h z19.h, p2, [%[outptr4], #2, MUL VL]\n"
+ "fmin z13.h, p1/m, z13.h, z0.h\n"
"ld1h z19.h, p1/z, [x8, #6, MUL VL]\n"
"fadd z16.h, z16.h, z3.h\n"
"addvl %[outptr4], %[outptr4], #3\n"
- "fmax z14.h, p2/m, z14.h, z1.h\n"
+ "fmin z14.h, p2/m, z14.h, z0.h\n"
"st1h z20.h, p0, [%[outptr5]]\n"
- "fmin z15.h, p0/m, z15.h, z0.h\n"
+ "fmax z13.h, p1/m, z13.h, z1.h\n"
"ld1h z20.h, p2/z, [x8, #7, MUL VL]\n"
+ "fmin z15.h, p0/m, z15.h, z0.h\n"
"fmin z16.h, p1/m, z16.h, z0.h\n"
- "fadd z17.h, z17.h, z4.h\n"
+ "fmax z14.h, p2/m, z14.h, z1.h\n"
"st1h z13.h, p1, [%[outptr5], #1, MUL VL]\n"
- "fadd z18.h, z18.h, z2.h\n"
+ "fadd z17.h, z17.h, z4.h\n"
"fmax z15.h, p0/m, z15.h, z1.h\n"
"fmax z16.h, p1/m, z16.h, z1.h\n"
"st1h z14.h, p2, [%[outptr5], #2, MUL VL]\n"
- "fmin z17.h, p2/m, z17.h, z0.h\n"
+ "fadd z18.h, z18.h, z2.h\n"
"addvl %[outptr5], %[outptr5], #3\n"
- "fmin z18.h, p0/m, z18.h, z0.h\n"
+ "fmin z17.h, p2/m, z17.h, z0.h\n"
"st1h z15.h, p0, [%[outptr6]]\n"
"fadd z19.h, z19.h, z3.h\n"
- "fmax z17.h, p2/m, z17.h, z1.h\n"
+ "fmin z18.h, p0/m, z18.h, z0.h\n"
"fadd z20.h, z20.h, z4.h\n"
"st1h z16.h, p1, [%[outptr6], #1, MUL VL]\n"
- "fmax z18.h, p0/m, z18.h, z1.h\n"
+ "fmax z17.h, p2/m, z17.h, z1.h\n"
"fmin z19.h, p1/m, z19.h, z0.h\n"
+ "fmax z18.h, p0/m, z18.h, z1.h\n"
"fmin z20.h, p2/m, z20.h, z0.h\n"
"st1h z17.h, p2, [%[outptr6], #2, MUL VL]\n"
"addvl %[outptr6], %[outptr6], #3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp
index 5505f1efe4..2da48922e3 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020,2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -1029,25 +1029,25 @@ void MergeResults<3, 8, true>(float *out, const float *in, const int ldout, cons
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
- "incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z13.s, p0/z, [%[inptr]]\n"
+ "incw %[p], all, mul #1\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "fadd z13.s, z13.s, z2.s\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
"whilelt p2.s, %[p], %[w]\n"
- "fadd z13.s, z13.s, z2.s\n"
+ "fmin z13.s, p0/m, z13.s, z0.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"fadd z14.s, z14.s, z3.s\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
"addvl %[inptr], %[inptr], #24\n"
- "fmin z13.s, p0/m, z13.s, z0.s\n"
+ "fmax z13.s, p0/m, z13.s, z1.s\n"
"fmin z14.s, p1/m, z14.s, z0.s\n"
"fadd z15.s, z15.s, z4.s\n"
- "fmax z13.s, p0/m, z13.s, z1.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
"fmax z14.s, p1/m, z14.s, z1.s\n"
"fmin z15.s, p2/m, z15.s, z0.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "fmax z15.s, p2/m, z15.s, z1.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
+ "fmax z15.s, p2/m, z15.s, z1.s\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"addvl %[outptr0], %[outptr0], #3\n"
: [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
@@ -1073,42 +1073,42 @@ void MergeResults<3, 8, true>(float *out, const float *in, const int ldout, cons
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "fadd z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "fadd z16.s, z16.s, z2.s\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
+ "fmin z13.s, p0/m, z13.s, z0.s\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "fadd z13.s, z13.s, z2.s\n"
- "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"fadd z14.s, z14.s, z3.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "fmin z16.s, p0/m, z16.s, z0.s\n"
+ "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
+ "fmax z13.s, p0/m, z13.s, z1.s\n"
"ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "fadd z17.s, z17.s, z3.s\n"
"addvl %[inptr], %[inptr], #24\n"
- "fmin z13.s, p0/m, z13.s, z0.s\n"
"fmin z14.s, p1/m, z14.s, z0.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
"fadd z15.s, z15.s, z4.s\n"
- "fadd z16.s, z16.s, z2.s\n"
- "fmax z13.s, p0/m, z13.s, z1.s\n"
+ "fmax z16.s, p0/m, z16.s, z1.s\n"
+ "fmin z17.s, p1/m, z17.s, z0.s\n"
"fmax z14.s, p1/m, z14.s, z1.s\n"
"fmin z15.s, p2/m, z15.s, z0.s\n"
- "fmin z16.s, p0/m, z16.s, z0.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "fadd z17.s, z17.s, z3.s\n"
"fadd z18.s, z18.s, z4.s\n"
- "fmax z15.s, p2/m, z15.s, z1.s\n"
+ "fmax z17.s, p1/m, z17.s, z1.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "fmax z16.s, p0/m, z16.s, z1.s\n"
- "fmin z17.s, p1/m, z17.s, z0.s\n"
+ "fmax z15.s, p2/m, z15.s, z1.s\n"
"fmin z18.s, p2/m, z18.s, z0.s\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"addvl %[outptr0], %[outptr0], #3\n"
- "fmax z17.s, p1/m, z17.s, z1.s\n"
- "st1w z16.s, p0, [%[outptr1]]\n"
"fmax z18.s, p2/m, z18.s, z1.s\n"
+ "st1w z16.s, p0, [%[outptr1]]\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
@@ -1135,60 +1135,60 @@ void MergeResults<3, 8, true>(float *out, const float *in, const int ldout, cons
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
- "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fadd z13.s, z13.s, z2.s\n"
- "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "fadd z14.s, z14.s, z3.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "fadd z16.s, z16.s, z2.s\n"
"ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"fmin z13.s, p0/m, z13.s, z0.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "fadd z15.s, z15.s, z4.s\n"
"ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "fadd z16.s, z16.s, z2.s\n"
- "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
- "fmin z14.s, p1/m, z14.s, z0.s\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fadd z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fmax z13.s, p0/m, z13.s, z1.s\n"
- "addvl %[inptr], %[inptr], #24\n"
- "fmax z14.s, p1/m, z14.s, z1.s\n"
- "fmin z15.s, p2/m, z15.s, z0.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "fmin z14.s, p1/m, z14.s, z0.s\n"
+ "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "fadd z15.s, z15.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
"fmin z16.s, p0/m, z16.s, z0.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "fmax z14.s, p1/m, z14.s, z1.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "fadd z17.s, z17.s, z3.s\n"
- "fadd z18.s, z18.s, z4.s\n"
+ "fmin z15.s, p2/m, z15.s, z0.s\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fmax z16.s, p0/m, z16.s, z1.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
+ "fadd z17.s, z17.s, z3.s\n"
+ "addvl %[inptr], %[inptr], #24\n"
"fmax z15.s, p2/m, z15.s, z1.s\n"
- "fmax z16.s, p0/m, z16.s, z1.s\n"
+ "fadd z18.s, z18.s, z4.s\n"
"fmin z17.s, p1/m, z17.s, z0.s\n"
- "fmin z18.s, p2/m, z18.s, z0.s\n"
- "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"fadd z19.s, z19.s, z2.s\n"
+ "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
+ "fadd z20.s, z20.s, z3.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
"fmax z17.s, p1/m, z17.s, z1.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "fmax z18.s, p2/m, z18.s, z1.s\n"
+ "fmin z18.s, p2/m, z18.s, z0.s\n"
"fmin z19.s, p0/m, z19.s, z0.s\n"
- "fadd z20.s, z20.s, z3.s\n"
+ "fmin z20.s, p1/m, z20.s, z0.s\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
"fadd z13.s, z13.s, z4.s\n"
+ "fmax z18.s, p2/m, z18.s, z1.s\n"
"fmax z19.s, p0/m, z19.s, z1.s\n"
+ "fmax z20.s, p1/m, z20.s, z1.s\n"
+ "fmin z13.s, p2/m, z13.s, z0.s\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
- "fmin z20.s, p1/m, z20.s, z0.s\n"
"addvl %[outptr1], %[outptr1], #3\n"
- "fmin z13.s, p2/m, z13.s, z0.s\n"
- "st1w z19.s, p0, [%[outptr2]]\n"
- "fmax z20.s, p1/m, z20.s, z1.s\n"
"fmax z13.s, p2/m, z13.s, z1.s\n"
+ "st1w z19.s, p0, [%[outptr2]]\n"
"st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
"st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
"addvl %[outptr2], %[outptr2], #3\n"
@@ -1215,75 +1215,75 @@ void MergeResults<3, 8, true>(float *out, const float *in, const int ldout, cons
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
- "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fadd z13.s, z13.s, z2.s\n"
- "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "fadd z14.s, z14.s, z3.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "fadd z16.s, z16.s, z2.s\n"
"ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"fmin z13.s, p0/m, z13.s, z0.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "fadd z15.s, z15.s, z4.s\n"
"ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "fadd z16.s, z16.s, z2.s\n"
- "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
- "fmin z14.s, p1/m, z14.s, z0.s\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fadd z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fmax z13.s, p0/m, z13.s, z1.s\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
- "fmax z14.s, p1/m, z14.s, z1.s\n"
- "addvl %[inptr], %[inptr], #24\n"
- "fmin z15.s, p2/m, z15.s, z0.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "fmin z14.s, p1/m, z14.s, z0.s\n"
+ "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "fadd z15.s, z15.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
"fmin z16.s, p0/m, z16.s, z0.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "fmax z14.s, p1/m, z14.s, z1.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "fadd z17.s, z17.s, z3.s\n"
- "fadd z18.s, z18.s, z4.s\n"
+ "fmin z15.s, p2/m, z15.s, z0.s\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fmax z16.s, p0/m, z16.s, z1.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "fmax z15.s, p2/m, z15.s, z1.s\n"
+ "fadd z17.s, z17.s, z3.s\n"
"ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "fmax z16.s, p0/m, z16.s, z1.s\n"
+ "fmax z15.s, p2/m, z15.s, z1.s\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "fadd z18.s, z18.s, z4.s\n"
+ "addvl %[inptr], %[inptr], #24\n"
"fmin z17.s, p1/m, z17.s, z0.s\n"
- "fmin z18.s, p2/m, z18.s, z0.s\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"fadd z19.s, z19.s, z2.s\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "fadd z20.s, z20.s, z3.s\n"
+ "fmin z18.s, p2/m, z18.s, z0.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
"fmax z17.s, p1/m, z17.s, z1.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "fmax z18.s, p2/m, z18.s, z1.s\n"
- "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
"fmin z19.s, p0/m, z19.s, z0.s\n"
- "fmin z20.s, p1/m, z20.s, z0.s\n"
+ "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
+ "fmax z18.s, p2/m, z18.s, z1.s\n"
+ "fadd z20.s, z20.s, z3.s\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
"fadd z13.s, z13.s, z4.s\n"
- "fadd z14.s, z14.s, z2.s\n"
"fmax z19.s, p0/m, z19.s, z1.s\n"
+ "fadd z14.s, z14.s, z2.s\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
- "fmax z20.s, p1/m, z20.s, z1.s\n"
+ "fmin z20.s, p1/m, z20.s, z0.s\n"
"addvl %[outptr1], %[outptr1], #3\n"
"fmin z13.s, p2/m, z13.s, z0.s\n"
"st1w z19.s, p0, [%[outptr2]]\n"
"fmin z14.s, p0/m, z14.s, z0.s\n"
+ "fmax z20.s, p1/m, z20.s, z1.s\n"
"fadd z15.s, z15.s, z3.s\n"
- "fadd z16.s, z16.s, z4.s\n"
- "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
"fmax z13.s, p2/m, z13.s, z1.s\n"
"fmax z14.s, p0/m, z14.s, z1.s\n"
+ "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
+ "fadd z16.s, z16.s, z4.s\n"
"fmin z15.s, p1/m, z15.s, z0.s\n"
- "fmin z16.s, p2/m, z16.s, z0.s\n"
"st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
"addvl %[outptr2], %[outptr2], #3\n"
"fmax z15.s, p1/m, z15.s, z1.s\n"
+ "fmin z16.s, p2/m, z16.s, z0.s\n"
"st1w z14.s, p0, [%[outptr3]]\n"
"fmax z16.s, p2/m, z16.s, z1.s\n"
"st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
@@ -1312,93 +1312,93 @@ void MergeResults<3, 8, true>(float *out, const float *in, const int ldout, cons
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
- "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fadd z13.s, z13.s, z2.s\n"
- "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "fadd z14.s, z14.s, z3.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "fadd z16.s, z16.s, z2.s\n"
"ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"fmin z13.s, p0/m, z13.s, z0.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "fadd z15.s, z15.s, z4.s\n"
"ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "fadd z16.s, z16.s, z2.s\n"
- "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
- "fmin z14.s, p1/m, z14.s, z0.s\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fadd z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fmax z13.s, p0/m, z13.s, z1.s\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
- "fmax z14.s, p1/m, z14.s, z1.s\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "fmin z15.s, p2/m, z15.s, z0.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "fmin z14.s, p1/m, z14.s, z0.s\n"
+ "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "fadd z15.s, z15.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
"fmin z16.s, p0/m, z16.s, z0.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "fmax z14.s, p1/m, z14.s, z1.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "fadd z17.s, z17.s, z3.s\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
- "fmax z15.s, p2/m, z15.s, z1.s\n"
- "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
+ "fmin z15.s, p2/m, z15.s, z0.s\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"fmax z16.s, p0/m, z16.s, z1.s\n"
+ "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
+ "fadd z17.s, z17.s, z3.s\n"
"ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "fmin z17.s, p1/m, z17.s, z0.s\n"
- "addvl %[inptr], %[inptr], #24\n"
+ "fmax z15.s, p2/m, z15.s, z1.s\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"fadd z18.s, z18.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmin z17.s, p1/m, z17.s, z0.s\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"fadd z19.s, z19.s, z2.s\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "fmax z17.s, p1/m, z17.s, z1.s\n"
- "addvl %[outptr0], %[outptr0], #3\n"
"fmin z18.s, p2/m, z18.s, z0.s\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmax z17.s, p1/m, z17.s, z1.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
"fmin z19.s, p0/m, z19.s, z0.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
+ "fmax z18.s, p2/m, z18.s, z1.s\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
"fadd z20.s, z20.s, z3.s\n"
- "fadd z13.s, z13.s, z4.s\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
- "fmax z18.s, p2/m, z18.s, z1.s\n"
- "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
"fmax z19.s, p0/m, z19.s, z1.s\n"
+ "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
+ "fadd z13.s, z13.s, z4.s\n"
+ "addvl %[inptr], %[inptr], #24\n"
"fmin z20.s, p1/m, z20.s, z0.s\n"
- "fmin z13.s, p2/m, z13.s, z0.s\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"fadd z14.s, z14.s, z2.s\n"
"ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
- "fadd z15.s, z15.s, z3.s\n"
+ "fmin z13.s, p2/m, z13.s, z0.s\n"
"addvl %[outptr1], %[outptr1], #3\n"
"fmax z20.s, p1/m, z20.s, z1.s\n"
"st1w z19.s, p0, [%[outptr2]]\n"
- "fmax z13.s, p2/m, z13.s, z1.s\n"
- "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
"fmin z14.s, p0/m, z14.s, z0.s\n"
- "fmin z15.s, p1/m, z15.s, z0.s\n"
+ "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
+ "fmax z13.s, p2/m, z13.s, z1.s\n"
+ "fadd z15.s, z15.s, z3.s\n"
"st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
"fadd z16.s, z16.s, z4.s\n"
- "fadd z17.s, z17.s, z2.s\n"
"fmax z14.s, p0/m, z14.s, z1.s\n"
+ "fadd z17.s, z17.s, z2.s\n"
"st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
- "fmax z15.s, p1/m, z15.s, z1.s\n"
+ "fmin z15.s, p1/m, z15.s, z0.s\n"
"addvl %[outptr2], %[outptr2], #3\n"
"fmin z16.s, p2/m, z16.s, z0.s\n"
"st1w z14.s, p0, [%[outptr3]]\n"
"fmin z17.s, p0/m, z17.s, z0.s\n"
+ "fmax z15.s, p1/m, z15.s, z1.s\n"
"fadd z18.s, z18.s, z3.s\n"
- "fadd z19.s, z19.s, z4.s\n"
- "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
"fmax z16.s, p2/m, z16.s, z1.s\n"
"fmax z17.s, p0/m, z17.s, z1.s\n"
+ "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
+ "fadd z19.s, z19.s, z4.s\n"
"fmin z18.s, p1/m, z18.s, z0.s\n"
- "fmin z19.s, p2/m, z19.s, z0.s\n"
"st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
"addvl %[outptr3], %[outptr3], #3\n"
"fmax z18.s, p1/m, z18.s, z1.s\n"
+ "fmin z19.s, p2/m, z19.s, z0.s\n"
"st1w z17.s, p0, [%[outptr4]]\n"
"fmax z19.s, p2/m, z19.s, z1.s\n"
"st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
@@ -1427,111 +1427,111 @@ void MergeResults<3, 8, true>(float *out, const float *in, const int ldout, cons
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
- "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fadd z13.s, z13.s, z2.s\n"
- "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "fadd z14.s, z14.s, z3.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "fadd z16.s, z16.s, z2.s\n"
"ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"fmin z13.s, p0/m, z13.s, z0.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "fadd z15.s, z15.s, z4.s\n"
"ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "fadd z16.s, z16.s, z2.s\n"
- "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
- "fmin z14.s, p1/m, z14.s, z0.s\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fadd z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fmax z13.s, p0/m, z13.s, z1.s\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
- "fmax z14.s, p1/m, z14.s, z1.s\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "fmin z15.s, p2/m, z15.s, z0.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "fmin z14.s, p1/m, z14.s, z0.s\n"
+ "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "fadd z15.s, z15.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
"fmin z16.s, p0/m, z16.s, z0.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "fmax z14.s, p1/m, z14.s, z1.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "fadd z17.s, z17.s, z3.s\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
- "fmax z15.s, p2/m, z15.s, z1.s\n"
- "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
+ "fmin z15.s, p2/m, z15.s, z0.s\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"fmax z16.s, p0/m, z16.s, z1.s\n"
+ "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
+ "fadd z17.s, z17.s, z3.s\n"
"ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "fmin z17.s, p1/m, z17.s, z0.s\n"
- "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmax z15.s, p2/m, z15.s, z1.s\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"fadd z18.s, z18.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmin z17.s, p1/m, z17.s, z0.s\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"fadd z19.s, z19.s, z2.s\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "fmax z17.s, p1/m, z17.s, z1.s\n"
- "addvl %[outptr0], %[outptr0], #3\n"
"fmin z18.s, p2/m, z18.s, z0.s\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmax z17.s, p1/m, z17.s, z1.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
"fmin z19.s, p0/m, z19.s, z0.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
- "fadd z20.s, z20.s, z3.s\n"
- "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"fmax z18.s, p2/m, z18.s, z1.s\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "fadd z20.s, z20.s, z3.s\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
"fmax z19.s, p0/m, z19.s, z1.s\n"
"ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
- "fmin z20.s, p1/m, z20.s, z0.s\n"
- "addvl %[inptr], %[inptr], #24\n"
"fadd z13.s, z13.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmin z20.s, p1/m, z20.s, z0.s\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"fadd z14.s, z14.s, z2.s\n"
"ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
- "fmax z20.s, p1/m, z20.s, z1.s\n"
- "addvl %[outptr1], %[outptr1], #3\n"
"fmin z13.s, p2/m, z13.s, z0.s\n"
+ "addvl %[outptr1], %[outptr1], #3\n"
+ "fmax z20.s, p1/m, z20.s, z1.s\n"
"st1w z19.s, p0, [%[outptr2]]\n"
"fmin z14.s, p0/m, z14.s, z0.s\n"
"ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
+ "fmax z13.s, p2/m, z13.s, z1.s\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"fadd z15.s, z15.s, z3.s\n"
- "fadd z16.s, z16.s, z4.s\n"
"st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
- "fmax z13.s, p2/m, z13.s, z1.s\n"
- "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
"fmax z14.s, p0/m, z14.s, z1.s\n"
+ "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
+ "fadd z16.s, z16.s, z4.s\n"
+ "addvl %[inptr], %[inptr], #24\n"
"fmin z15.s, p1/m, z15.s, z0.s\n"
- "fmin z16.s, p2/m, z16.s, z0.s\n"
"st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
"fadd z17.s, z17.s, z2.s\n"
"ld1w z13.s, p1/z, [x8]\n"
- "fadd z18.s, z18.s, z3.s\n"
+ "fmin z16.s, p2/m, z16.s, z0.s\n"
"addvl %[outptr2], %[outptr2], #3\n"
"fmax z15.s, p1/m, z15.s, z1.s\n"
"st1w z14.s, p0, [%[outptr3]]\n"
- "fmax z16.s, p2/m, z16.s, z1.s\n"
- "ld1w z14.s, p2/z, [x8, #1, MUL VL]\n"
"fmin z17.s, p0/m, z17.s, z0.s\n"
- "fmin z18.s, p1/m, z18.s, z0.s\n"
+ "ld1w z14.s, p2/z, [x8, #1, MUL VL]\n"
+ "fmax z16.s, p2/m, z16.s, z1.s\n"
+ "fadd z18.s, z18.s, z3.s\n"
"st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
"fadd z19.s, z19.s, z4.s\n"
- "fadd z20.s, z20.s, z2.s\n"
"fmax z17.s, p0/m, z17.s, z1.s\n"
+ "fadd z20.s, z20.s, z2.s\n"
"st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
- "fmax z18.s, p1/m, z18.s, z1.s\n"
+ "fmin z18.s, p1/m, z18.s, z0.s\n"
"addvl %[outptr3], %[outptr3], #3\n"
"fmin z19.s, p2/m, z19.s, z0.s\n"
"st1w z17.s, p0, [%[outptr4]]\n"
"fmin z20.s, p0/m, z20.s, z0.s\n"
+ "fmax z18.s, p1/m, z18.s, z1.s\n"
"fadd z13.s, z13.s, z3.s\n"
- "fadd z14.s, z14.s, z4.s\n"
- "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
"fmax z19.s, p2/m, z19.s, z1.s\n"
"fmax z20.s, p0/m, z20.s, z1.s\n"
+ "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
+ "fadd z14.s, z14.s, z4.s\n"
"fmin z13.s, p1/m, z13.s, z0.s\n"
- "fmin z14.s, p2/m, z14.s, z0.s\n"
"st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
"addvl %[outptr4], %[outptr4], #3\n"
"fmax z13.s, p1/m, z13.s, z1.s\n"
+ "fmin z14.s, p2/m, z14.s, z0.s\n"
"st1w z20.s, p0, [%[outptr5]]\n"
"fmax z14.s, p2/m, z14.s, z1.s\n"
"st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n"
@@ -1560,129 +1560,129 @@ void MergeResults<3, 8, true>(float *out, const float *in, const int ldout, cons
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
- "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fadd z13.s, z13.s, z2.s\n"
- "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "fadd z14.s, z14.s, z3.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "fadd z16.s, z16.s, z2.s\n"
"ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"fmin z13.s, p0/m, z13.s, z0.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "fadd z15.s, z15.s, z4.s\n"
"ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "fadd z16.s, z16.s, z2.s\n"
- "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
- "fmin z14.s, p1/m, z14.s, z0.s\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fadd z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fmax z13.s, p0/m, z13.s, z1.s\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
- "fmax z14.s, p1/m, z14.s, z1.s\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "fmin z15.s, p2/m, z15.s, z0.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "fmin z14.s, p1/m, z14.s, z0.s\n"
+ "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "fadd z15.s, z15.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
"fmin z16.s, p0/m, z16.s, z0.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "fmax z14.s, p1/m, z14.s, z1.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "fadd z17.s, z17.s, z3.s\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
- "fmax z15.s, p2/m, z15.s, z1.s\n"
- "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
+ "fmin z15.s, p2/m, z15.s, z0.s\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"fmax z16.s, p0/m, z16.s, z1.s\n"
+ "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
+ "fadd z17.s, z17.s, z3.s\n"
"ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "fmin z17.s, p1/m, z17.s, z0.s\n"
- "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmax z15.s, p2/m, z15.s, z1.s\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"fadd z18.s, z18.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmin z17.s, p1/m, z17.s, z0.s\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"fadd z19.s, z19.s, z2.s\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "fmax z17.s, p1/m, z17.s, z1.s\n"
- "addvl %[outptr0], %[outptr0], #3\n"
"fmin z18.s, p2/m, z18.s, z0.s\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmax z17.s, p1/m, z17.s, z1.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
"fmin z19.s, p0/m, z19.s, z0.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
- "fadd z20.s, z20.s, z3.s\n"
- "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"fmax z18.s, p2/m, z18.s, z1.s\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "fadd z20.s, z20.s, z3.s\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
"fmax z19.s, p0/m, z19.s, z1.s\n"
"ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
- "fmin z20.s, p1/m, z20.s, z0.s\n"
- "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
"fadd z13.s, z13.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmin z20.s, p1/m, z20.s, z0.s\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"fadd z14.s, z14.s, z2.s\n"
"ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
- "fmax z20.s, p1/m, z20.s, z1.s\n"
- "addvl %[outptr1], %[outptr1], #3\n"
"fmin z13.s, p2/m, z13.s, z0.s\n"
+ "addvl %[outptr1], %[outptr1], #3\n"
+ "fmax z20.s, p1/m, z20.s, z1.s\n"
"st1w z19.s, p0, [%[outptr2]]\n"
"fmin z14.s, p0/m, z14.s, z0.s\n"
"ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
- "fadd z15.s, z15.s, z3.s\n"
- "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
"fmax z13.s, p2/m, z13.s, z1.s\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "fadd z15.s, z15.s, z3.s\n"
"st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
"fmax z14.s, p0/m, z14.s, z1.s\n"
"ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
- "fmin z15.s, p1/m, z15.s, z0.s\n"
- "addvl %[inptr], %[inptr], #24\n"
"fadd z16.s, z16.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "fmin z15.s, p1/m, z15.s, z0.s\n"
"st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
"fadd z17.s, z17.s, z2.s\n"
"ld1w z13.s, p1/z, [x8]\n"
- "fmax z15.s, p1/m, z15.s, z1.s\n"
- "addvl %[outptr2], %[outptr2], #3\n"
"fmin z16.s, p2/m, z16.s, z0.s\n"
+ "addvl %[outptr2], %[outptr2], #3\n"
+ "fmax z15.s, p1/m, z15.s, z1.s\n"
"st1w z14.s, p0, [%[outptr3]]\n"
"fmin z17.s, p0/m, z17.s, z0.s\n"
"ld1w z14.s, p2/z, [x8, #1, MUL VL]\n"
+ "fmax z16.s, p2/m, z16.s, z1.s\n"
+ "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
"fadd z18.s, z18.s, z3.s\n"
- "fadd z19.s, z19.s, z4.s\n"
"st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
- "fmax z16.s, p2/m, z16.s, z1.s\n"
- "ld1w z15.s, p0/z, [x8, #2, MUL VL]\n"
"fmax z17.s, p0/m, z17.s, z1.s\n"
+ "ld1w z15.s, p0/z, [x8, #2, MUL VL]\n"
+ "fadd z19.s, z19.s, z4.s\n"
+ "addvl %[inptr], %[inptr], #24\n"
"fmin z18.s, p1/m, z18.s, z0.s\n"
- "fmin z19.s, p2/m, z19.s, z0.s\n"
"st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
"fadd z20.s, z20.s, z2.s\n"
"ld1w z16.s, p1/z, [x8, #3, MUL VL]\n"
- "fadd z13.s, z13.s, z3.s\n"
+ "fmin z19.s, p2/m, z19.s, z0.s\n"
"addvl %[outptr3], %[outptr3], #3\n"
"fmax z18.s, p1/m, z18.s, z1.s\n"
"st1w z17.s, p0, [%[outptr4]]\n"
- "fmax z19.s, p2/m, z19.s, z1.s\n"
- "ld1w z17.s, p2/z, [x8, #4, MUL VL]\n"
"fmin z20.s, p0/m, z20.s, z0.s\n"
- "fmin z13.s, p1/m, z13.s, z0.s\n"
+ "ld1w z17.s, p2/z, [x8, #4, MUL VL]\n"
+ "fmax z19.s, p2/m, z19.s, z1.s\n"
+ "fadd z13.s, z13.s, z3.s\n"
"st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
"fadd z14.s, z14.s, z4.s\n"
- "fadd z15.s, z15.s, z2.s\n"
"fmax z20.s, p0/m, z20.s, z1.s\n"
+ "fadd z15.s, z15.s, z2.s\n"
"st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
- "fmax z13.s, p1/m, z13.s, z1.s\n"
+ "fmin z13.s, p1/m, z13.s, z0.s\n"
"addvl %[outptr4], %[outptr4], #3\n"
"fmin z14.s, p2/m, z14.s, z0.s\n"
"st1w z20.s, p0, [%[outptr5]]\n"
"fmin z15.s, p0/m, z15.s, z0.s\n"
+ "fmax z13.s, p1/m, z13.s, z1.s\n"
"fadd z16.s, z16.s, z3.s\n"
- "fadd z17.s, z17.s, z4.s\n"
- "st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n"
"fmax z14.s, p2/m, z14.s, z1.s\n"
"fmax z15.s, p0/m, z15.s, z1.s\n"
+ "st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n"
+ "fadd z17.s, z17.s, z4.s\n"
"fmin z16.s, p1/m, z16.s, z0.s\n"
- "fmin z17.s, p2/m, z17.s, z0.s\n"
"st1w z14.s, p2, [%[outptr5], #2, MUL VL]\n"
"addvl %[outptr5], %[outptr5], #3\n"
"fmax z16.s, p1/m, z16.s, z1.s\n"
+ "fmin z17.s, p2/m, z17.s, z0.s\n"
"st1w z15.s, p0, [%[outptr6]]\n"
"fmax z17.s, p2/m, z17.s, z1.s\n"
"st1w z16.s, p1, [%[outptr6], #1, MUL VL]\n"
@@ -1712,142 +1712,142 @@ void MergeResults<3, 8, true>(float *out, const float *in, const int ldout, cons
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
- "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fadd z13.s, z13.s, z2.s\n"
- "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "fadd z14.s, z14.s, z3.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "fadd z16.s, z16.s, z2.s\n"
"ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"fmin z13.s, p0/m, z13.s, z0.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "fadd z15.s, z15.s, z4.s\n"
"ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "fadd z16.s, z16.s, z2.s\n"
- "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
- "fmin z14.s, p1/m, z14.s, z0.s\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fadd z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fmax z13.s, p0/m, z13.s, z1.s\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
- "fmax z14.s, p1/m, z14.s, z1.s\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "fmin z15.s, p2/m, z15.s, z0.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "fmin z14.s, p1/m, z14.s, z0.s\n"
+ "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "fadd z15.s, z15.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
"fmin z16.s, p0/m, z16.s, z0.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "fmax z14.s, p1/m, z14.s, z1.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "fadd z17.s, z17.s, z3.s\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
- "fmax z15.s, p2/m, z15.s, z1.s\n"
- "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
+ "fmin z15.s, p2/m, z15.s, z0.s\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"fmax z16.s, p0/m, z16.s, z1.s\n"
+ "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
+ "fadd z17.s, z17.s, z3.s\n"
"ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "fmin z17.s, p1/m, z17.s, z0.s\n"
- "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmax z15.s, p2/m, z15.s, z1.s\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"fadd z18.s, z18.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmin z17.s, p1/m, z17.s, z0.s\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"fadd z19.s, z19.s, z2.s\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "fmax z17.s, p1/m, z17.s, z1.s\n"
- "addvl %[outptr0], %[outptr0], #3\n"
"fmin z18.s, p2/m, z18.s, z0.s\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmax z17.s, p1/m, z17.s, z1.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
"fmin z19.s, p0/m, z19.s, z0.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
- "fadd z20.s, z20.s, z3.s\n"
- "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"fmax z18.s, p2/m, z18.s, z1.s\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "fadd z20.s, z20.s, z3.s\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
"fmax z19.s, p0/m, z19.s, z1.s\n"
"ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
- "fmin z20.s, p1/m, z20.s, z0.s\n"
- "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
"fadd z13.s, z13.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmin z20.s, p1/m, z20.s, z0.s\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"fadd z14.s, z14.s, z2.s\n"
"ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
- "fmax z20.s, p1/m, z20.s, z1.s\n"
- "addvl %[outptr1], %[outptr1], #3\n"
"fmin z13.s, p2/m, z13.s, z0.s\n"
+ "addvl %[outptr1], %[outptr1], #3\n"
+ "fmax z20.s, p1/m, z20.s, z1.s\n"
"st1w z19.s, p0, [%[outptr2]]\n"
"fmin z14.s, p0/m, z14.s, z0.s\n"
"ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
- "fadd z15.s, z15.s, z3.s\n"
- "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
"fmax z13.s, p2/m, z13.s, z1.s\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "fadd z15.s, z15.s, z3.s\n"
"st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
"fmax z14.s, p0/m, z14.s, z1.s\n"
"ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
- "fmin z15.s, p1/m, z15.s, z0.s\n"
- "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
"fadd z16.s, z16.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "fmin z15.s, p1/m, z15.s, z0.s\n"
"st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
"fadd z17.s, z17.s, z2.s\n"
"ld1w z13.s, p1/z, [x8]\n"
- "fmax z15.s, p1/m, z15.s, z1.s\n"
- "addvl %[outptr2], %[outptr2], #3\n"
"fmin z16.s, p2/m, z16.s, z0.s\n"
+ "addvl %[outptr2], %[outptr2], #3\n"
+ "fmax z15.s, p1/m, z15.s, z1.s\n"
"st1w z14.s, p0, [%[outptr3]]\n"
"fmin z17.s, p0/m, z17.s, z0.s\n"
"ld1w z14.s, p2/z, [x8, #1, MUL VL]\n"
- "fadd z18.s, z18.s, z3.s\n"
- "addvl %[inptr], %[inptr], #24\n"
"fmax z16.s, p2/m, z16.s, z1.s\n"
+ "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
+ "fadd z18.s, z18.s, z3.s\n"
"st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
"fmax z17.s, p0/m, z17.s, z1.s\n"
"ld1w z15.s, p0/z, [x8, #2, MUL VL]\n"
- "fmin z18.s, p1/m, z18.s, z0.s\n"
"fadd z19.s, z19.s, z4.s\n"
+ "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
+ "fmin z18.s, p1/m, z18.s, z0.s\n"
"st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
"fadd z20.s, z20.s, z2.s\n"
"ld1w z16.s, p1/z, [x8, #3, MUL VL]\n"
- "fadd z13.s, z13.s, z3.s\n"
+ "fmin z19.s, p2/m, z19.s, z0.s\n"
"addvl %[outptr3], %[outptr3], #3\n"
"fmax z18.s, p1/m, z18.s, z1.s\n"
"st1w z17.s, p0, [%[outptr4]]\n"
- "fmin z19.s, p2/m, z19.s, z0.s\n"
- "ld1w z17.s, p2/z, [x8, #4, MUL VL]\n"
"fmin z20.s, p0/m, z20.s, z0.s\n"
- "fmin z13.s, p1/m, z13.s, z0.s\n"
- "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
- "fadd z14.s, z14.s, z4.s\n"
- "ld1w z18.s, p0/z, [x8, #5, MUL VL]\n"
+ "ld1w z17.s, p2/z, [x8, #4, MUL VL]\n"
"fmax z19.s, p2/m, z19.s, z1.s\n"
+ "addvl %[inptr], %[inptr], #24\n"
+ "fadd z13.s, z13.s, z3.s\n"
+ "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
"fmax z20.s, p0/m, z20.s, z1.s\n"
- "fmax z13.s, p1/m, z13.s, z1.s\n"
- "fmin z14.s, p2/m, z14.s, z0.s\n"
- "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
+ "ld1w z18.s, p0/z, [x8, #5, MUL VL]\n"
+ "fadd z14.s, z14.s, z4.s\n"
"fadd z15.s, z15.s, z2.s\n"
+ "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
+ "fmin z13.s, p1/m, z13.s, z0.s\n"
"ld1w z19.s, p1/z, [x8, #6, MUL VL]\n"
"fadd z16.s, z16.s, z3.s\n"
"addvl %[outptr4], %[outptr4], #3\n"
- "fmax z14.s, p2/m, z14.s, z1.s\n"
+ "fmin z14.s, p2/m, z14.s, z0.s\n"
"st1w z20.s, p0, [%[outptr5]]\n"
- "fmin z15.s, p0/m, z15.s, z0.s\n"
+ "fmax z13.s, p1/m, z13.s, z1.s\n"
"ld1w z20.s, p2/z, [x8, #7, MUL VL]\n"
+ "fmin z15.s, p0/m, z15.s, z0.s\n"
"fmin z16.s, p1/m, z16.s, z0.s\n"
- "fadd z17.s, z17.s, z4.s\n"
+ "fmax z14.s, p2/m, z14.s, z1.s\n"
"st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n"
- "fadd z18.s, z18.s, z2.s\n"
+ "fadd z17.s, z17.s, z4.s\n"
"fmax z15.s, p0/m, z15.s, z1.s\n"
"fmax z16.s, p1/m, z16.s, z1.s\n"
"st1w z14.s, p2, [%[outptr5], #2, MUL VL]\n"
- "fmin z17.s, p2/m, z17.s, z0.s\n"
+ "fadd z18.s, z18.s, z2.s\n"
"addvl %[outptr5], %[outptr5], #3\n"
- "fmin z18.s, p0/m, z18.s, z0.s\n"
+ "fmin z17.s, p2/m, z17.s, z0.s\n"
"st1w z15.s, p0, [%[outptr6]]\n"
"fadd z19.s, z19.s, z3.s\n"
- "fmax z17.s, p2/m, z17.s, z1.s\n"
+ "fmin z18.s, p0/m, z18.s, z0.s\n"
"fadd z20.s, z20.s, z4.s\n"
"st1w z16.s, p1, [%[outptr6], #1, MUL VL]\n"
- "fmax z18.s, p0/m, z18.s, z1.s\n"
+ "fmax z17.s, p2/m, z17.s, z1.s\n"
"fmin z19.s, p1/m, z19.s, z0.s\n"
+ "fmax z18.s, p0/m, z18.s, z1.s\n"
"fmin z20.s, p2/m, z20.s, z0.s\n"
"st1w z17.s, p2, [%[outptr6], #2, MUL VL]\n"
"addvl %[outptr6], %[outptr6], #3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp
index c009881254..115ba59459 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020,2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -781,19 +781,19 @@ void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout,
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
- "incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z13.s, p0/z, [%[inptr]]\n"
+ "incw %[p], all, mul #1\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "add z13.s, z13.s, z2.s\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
"whilelt p2.s, %[p], %[w]\n"
- "add z13.s, z13.s, z2.s\n"
"add z14.s, z14.s, z3.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
"addvl %[inptr], %[inptr], #24\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
+ "add z15.s, z15.s, z4.s\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"addvl %[outptr0], %[outptr0], #3\n"
: [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
@@ -817,27 +817,27 @@ void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout,
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
- "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"add z14.s, z14.s, z3.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
+ "add z17.s, z17.s, z3.s\n"
"ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
"addvl %[inptr], %[inptr], #24\n"
+ "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
"add z15.s, z15.s, z4.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z16.s, z16.s, z2.s\n"
- "add z17.s, z17.s, z3.s\n"
"add z18.s, z18.s, z4.s\n"
- "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"addvl %[outptr0], %[outptr0], #3\n"
"st1w z16.s, p0, [%[outptr1]]\n"
@@ -865,38 +865,38 @@ void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout,
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
- "add z19.s, z19.s, z2.s\n"
- "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
"addvl %[inptr], %[inptr], #24\n"
+ "add z18.s, z18.s, z4.s\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "addvl %[outptr0], %[outptr0], #3\n"
+ "add z19.s, z19.s, z2.s\n"
+ "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
"add z20.s, z20.s, z3.s\n"
- "add z13.s, z13.s, z4.s\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
"st1w z16.s, p0, [%[outptr1]]\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
@@ -925,44 +925,44 @@ void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout,
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"add z19.s, z19.s, z2.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"add z20.s, z20.s, z3.s\n"
- "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "add z13.s, z13.s, z4.s\n"
+ "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "add z14.s, z14.s, z2.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "addvl %[inptr], %[inptr], #24\n"
- "add z15.s, z15.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
+ "add z15.s, z15.s, z3.s\n"
+ "addvl %[inptr], %[inptr], #24\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
"add z16.s, z16.s, z4.s\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
@@ -996,49 +996,49 @@ void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout,
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"add z19.s, z19.s, z2.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"add z20.s, z20.s, z3.s\n"
- "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "add z13.s, z13.s, z4.s\n"
+ "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "add z14.s, z14.s, z2.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "add z15.s, z15.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "add z15.s, z15.s, z3.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
- "addvl %[inptr], %[inptr], #24\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
"add z16.s, z16.s, z4.s\n"
"ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
+ "addvl %[inptr], %[inptr], #24\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
"add z17.s, z17.s, z2.s\n"
@@ -1079,61 +1079,61 @@ void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout,
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"add z19.s, z19.s, z2.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"add z20.s, z20.s, z3.s\n"
- "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "add z13.s, z13.s, z4.s\n"
+ "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "add z14.s, z14.s, z2.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "add z15.s, z15.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "add z15.s, z15.s, z3.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
"add z16.s, z16.s, z4.s\n"
"ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
"add z17.s, z17.s, z2.s\n"
"ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
- "addvl %[inptr], %[inptr], #24\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"st1w z19.s, p0, [%[outptr2]]\n"
- "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
+ "addvl %[inptr], %[inptr], #24\n"
"add z18.s, z18.s, z3.s\n"
+ "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
"st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
- "add z19.s, z19.s, z4.s\n"
"ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
+ "add z19.s, z19.s, z4.s\n"
"st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
"addvl %[outptr2], %[outptr2], #3\n"
"add z20.s, z20.s, z2.s\n"
@@ -1174,63 +1174,63 @@ void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout,
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"add z19.s, z19.s, z2.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"add z20.s, z20.s, z3.s\n"
- "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "add z13.s, z13.s, z4.s\n"
+ "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "add z14.s, z14.s, z2.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "add z15.s, z15.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "add z15.s, z15.s, z3.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
"add z16.s, z16.s, z4.s\n"
"ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
"add z17.s, z17.s, z2.s\n"
"ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"st1w z19.s, p0, [%[outptr2]]\n"
- "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
"add z18.s, z18.s, z3.s\n"
"ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
- "addvl %[inptr], %[inptr], #24\n"
+ "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
"st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
- "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
+ "addvl %[inptr], %[inptr], #24\n"
"add z19.s, z19.s, z4.s\n"
+ "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
"st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
"addvl %[outptr2], %[outptr2], #3\n"
"add z20.s, z20.s, z2.s\n"
@@ -1282,64 +1282,64 @@ void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout,
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"add z19.s, z19.s, z2.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"add z20.s, z20.s, z3.s\n"
- "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "add z13.s, z13.s, z4.s\n"
+ "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "add z14.s, z14.s, z2.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "add z15.s, z15.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "add z15.s, z15.s, z3.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
"add z16.s, z16.s, z4.s\n"
"ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
"add z17.s, z17.s, z2.s\n"
"ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"st1w z19.s, p0, [%[outptr2]]\n"
- "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
"add z18.s, z18.s, z3.s\n"
"ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
+ "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
"st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
- "addvl %[inptr], %[inptr], #24\n"
+ "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
"add z19.s, z19.s, z4.s\n"
"ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
+ "addvl %[inptr], %[inptr], #24\n"
"st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
"addvl %[outptr2], %[outptr2], #3\n"
"add z20.s, z20.s, z2.s\n"
diff --git a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_u32_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_u32_3VLx8.hpp
index e992f6722c..358ed79989 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_u32_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_u32_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020,2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -781,19 +781,19 @@ void MergeResults<3, 8, true>(uint32_t *out, const uint32_t *in, const int ldout
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
- "incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z13.s, p0/z, [%[inptr]]\n"
+ "incw %[p], all, mul #1\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "add z13.s, z13.s, z2.s\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
"whilelt p2.s, %[p], %[w]\n"
- "add z13.s, z13.s, z2.s\n"
"add z14.s, z14.s, z3.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
"addvl %[inptr], %[inptr], #24\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
+ "add z15.s, z15.s, z4.s\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"addvl %[outptr0], %[outptr0], #3\n"
: [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
@@ -817,27 +817,27 @@ void MergeResults<3, 8, true>(uint32_t *out, const uint32_t *in, const int ldout
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
- "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"add z14.s, z14.s, z3.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
+ "add z17.s, z17.s, z3.s\n"
"ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
"addvl %[inptr], %[inptr], #24\n"
+ "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
"add z15.s, z15.s, z4.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z16.s, z16.s, z2.s\n"
- "add z17.s, z17.s, z3.s\n"
"add z18.s, z18.s, z4.s\n"
- "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"addvl %[outptr0], %[outptr0], #3\n"
"st1w z16.s, p0, [%[outptr1]]\n"
@@ -865,38 +865,38 @@ void MergeResults<3, 8, true>(uint32_t *out, const uint32_t *in, const int ldout
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
- "add z19.s, z19.s, z2.s\n"
- "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
"addvl %[inptr], %[inptr], #24\n"
+ "add z18.s, z18.s, z4.s\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "addvl %[outptr0], %[outptr0], #3\n"
+ "add z19.s, z19.s, z2.s\n"
+ "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
"add z20.s, z20.s, z3.s\n"
- "add z13.s, z13.s, z4.s\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
"st1w z16.s, p0, [%[outptr1]]\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
@@ -925,44 +925,44 @@ void MergeResults<3, 8, true>(uint32_t *out, const uint32_t *in, const int ldout
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"add z19.s, z19.s, z2.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"add z20.s, z20.s, z3.s\n"
- "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "add z13.s, z13.s, z4.s\n"
+ "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "add z14.s, z14.s, z2.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "addvl %[inptr], %[inptr], #24\n"
- "add z15.s, z15.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
+ "add z15.s, z15.s, z3.s\n"
+ "addvl %[inptr], %[inptr], #24\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
"add z16.s, z16.s, z4.s\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
@@ -996,49 +996,49 @@ void MergeResults<3, 8, true>(uint32_t *out, const uint32_t *in, const int ldout
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"add z19.s, z19.s, z2.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"add z20.s, z20.s, z3.s\n"
- "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "add z13.s, z13.s, z4.s\n"
+ "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "add z14.s, z14.s, z2.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "add z15.s, z15.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "add z15.s, z15.s, z3.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
- "addvl %[inptr], %[inptr], #24\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
"add z16.s, z16.s, z4.s\n"
"ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
+ "addvl %[inptr], %[inptr], #24\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
"add z17.s, z17.s, z2.s\n"
@@ -1079,61 +1079,61 @@ void MergeResults<3, 8, true>(uint32_t *out, const uint32_t *in, const int ldout
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"add z19.s, z19.s, z2.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"add z20.s, z20.s, z3.s\n"
- "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "add z13.s, z13.s, z4.s\n"
+ "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "add z14.s, z14.s, z2.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "add z15.s, z15.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "add z15.s, z15.s, z3.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
"add z16.s, z16.s, z4.s\n"
"ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
"add z17.s, z17.s, z2.s\n"
"ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
- "addvl %[inptr], %[inptr], #24\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"st1w z19.s, p0, [%[outptr2]]\n"
- "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
+ "addvl %[inptr], %[inptr], #24\n"
"add z18.s, z18.s, z3.s\n"
+ "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
"st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
- "add z19.s, z19.s, z4.s\n"
"ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
+ "add z19.s, z19.s, z4.s\n"
"st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
"addvl %[outptr2], %[outptr2], #3\n"
"add z20.s, z20.s, z2.s\n"
@@ -1174,63 +1174,63 @@ void MergeResults<3, 8, true>(uint32_t *out, const uint32_t *in, const int ldout
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"add z19.s, z19.s, z2.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"add z20.s, z20.s, z3.s\n"
- "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "add z13.s, z13.s, z4.s\n"
+ "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "add z14.s, z14.s, z2.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "add z15.s, z15.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "add z15.s, z15.s, z3.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
"add z16.s, z16.s, z4.s\n"
"ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
"add z17.s, z17.s, z2.s\n"
"ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"st1w z19.s, p0, [%[outptr2]]\n"
- "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
"add z18.s, z18.s, z3.s\n"
"ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
- "addvl %[inptr], %[inptr], #24\n"
+ "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
"st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
- "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
+ "addvl %[inptr], %[inptr], #24\n"
"add z19.s, z19.s, z4.s\n"
+ "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
"st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
"addvl %[outptr2], %[outptr2], #3\n"
"add z20.s, z20.s, z2.s\n"
@@ -1282,64 +1282,64 @@ void MergeResults<3, 8, true>(uint32_t *out, const uint32_t *in, const int ldout
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"add z19.s, z19.s, z2.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"add z20.s, z20.s, z3.s\n"
- "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "add z13.s, z13.s, z4.s\n"
+ "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "add z14.s, z14.s, z2.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "add z15.s, z15.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "add z15.s, z15.s, z3.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
"add z16.s, z16.s, z4.s\n"
"ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
"add z17.s, z17.s, z2.s\n"
"ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"st1w z19.s, p0, [%[outptr2]]\n"
- "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
"add z18.s, z18.s, z3.s\n"
"ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
+ "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
"st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
- "addvl %[inptr], %[inptr], #24\n"
+ "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
"add z19.s, z19.s, z4.s\n"
"ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
+ "addvl %[inptr], %[inptr], #24\n"
"st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
"addvl %[outptr2], %[outptr2], #3\n"
"add z20.s, z20.s, z2.s\n"