about | summary | refs | log | tree | commit | diff
path: root/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp')
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp | 232
1 file changed, 4 insertions, 228 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp
index cd01411722..489b381624 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -115,19 +115,13 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #1\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1rqb z1.b, p6/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1rqb z2.b, p6/z, [a_ptr2]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1rqb z3.b, p6/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1rqb z4.b, p6/z, [a_ptr4]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
"ld1rqb z5.b, p6/z, [a_ptr5]\n"
"ld1rqb z6.b, p6/z, [a_ptr6]\n"
"ld1rqb z7.b, p6/z, [a_ptr7]\n"
@@ -185,15 +179,7 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z31.s, #0\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
@@ -339,17 +325,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1rqb z1.b, p6/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1rqb z2.b, p6/z, [a_ptr2]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1rqb z3.b, p6/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1rqb z4.b, p6/z, [a_ptr4]\n"
"ld1rqb z5.b, p6/z, [a_ptr5]\n"
"ld1rqb z6.b, p6/z, [a_ptr6]\n"
@@ -419,21 +399,13 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z31.s, #0\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"b.ne 4b\n"
@@ -598,17 +570,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1rqb z1.b, p6/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1rqb z2.b, p6/z, [a_ptr2]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1rqb z3.b, p6/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1rqb z4.b, p6/z, [a_ptr4]\n"
"ld1rqb z5.b, p6/z, [a_ptr5]\n"
"ld1rqb z6.b, p6/z, [a_ptr6]\n"
@@ -689,21 +655,13 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"sdot z30.s, z16.b, z6.b[0]\n"
"addvl %[b_ptr0], %[b_ptr0], #3\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
@@ -892,17 +850,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1rqb z1.b, p6/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1rqb z2.b, p6/z, [a_ptr2]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1rqb z3.b, p6/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1rqb z4.b, p6/z, [a_ptr4]\n"
"ld1rqb z5.b, p6/z, [a_ptr5]\n"
"ld1rqb z6.b, p6/z, [a_ptr6]\n"
@@ -993,21 +945,13 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"sdot z30.s, z16.b, z6.b[0]\n"
"addvl %[b_ptr0], %[b_ptr0], #4\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
@@ -1221,17 +1165,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"addvl %[b_ptr0], %[b_ptr0], #5\n"
"cbz %[loops], 2f\n"
"mov z24.s, #0\n"
@@ -1312,7 +1250,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
@@ -1350,19 +1287,12 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z16.b, z7.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
@@ -1641,17 +1571,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"addvl %[b_ptr0], %[b_ptr0], #6\n"
"cbz %[loops], 2f\n"
@@ -1741,7 +1665,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
@@ -1753,7 +1676,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z27.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z28.s, p7, [c_ptr4]\n"
"addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
@@ -1781,17 +1703,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z16.b, z7.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
@@ -2096,17 +2012,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"addvl %[b_ptr0], %[b_ptr0], #7\n"
@@ -2205,13 +2115,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -2245,17 +2153,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"sdot z30.s, z16.b, z6.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
@@ -2586,17 +2488,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
@@ -2704,13 +2600,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -2722,7 +2616,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
@@ -2746,15 +2639,10 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
@@ -3111,17 +2999,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
@@ -3247,13 +3129,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -3265,7 +3145,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
@@ -3289,19 +3168,14 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"addvl %[b_ptr0], %[b_ptr0], #1\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
@@ -3708,17 +3582,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
@@ -3853,13 +3721,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -3871,7 +3737,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
@@ -3895,17 +3760,12 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
@@ -4341,17 +4201,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
@@ -4495,13 +4349,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -4513,7 +4365,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
@@ -4537,17 +4388,12 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
@@ -5010,17 +4856,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
@@ -5173,13 +5013,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -5191,7 +5029,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
@@ -5215,17 +5052,12 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
@@ -5715,17 +5547,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
@@ -5895,13 +5721,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -5913,7 +5737,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
@@ -5937,17 +5760,12 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
@@ -6488,17 +6306,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
@@ -6677,13 +6489,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -6695,7 +6505,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
@@ -6719,17 +6528,12 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
@@ -7297,17 +7101,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
@@ -7495,13 +7293,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -7513,7 +7309,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
@@ -7537,17 +7332,12 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
@@ -8143,17 +7933,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
@@ -8350,13 +8134,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -8368,7 +8150,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
@@ -8392,17 +8173,12 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
@@ -8968,4 +8744,4 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif // __ARM_FEATURE_SVE