diff options
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp')
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp | 232 |
1 files changed, 4 insertions, 228 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp index cd01411722..489b381624 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -115,19 +115,13 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "addvl %[b_ptr0], %[b_ptr0], #1\n" "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1rqb z1.b, p6/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1rqb z2.b, p6/z, [a_ptr2]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1rqb z3.b, p6/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1rqb z4.b, p6/z, [a_ptr4]\n" - "addvl %[b_ptr0], %[b_ptr0], #1\n" "ld1rqb z5.b, p6/z, [a_ptr5]\n" "ld1rqb z6.b, p6/z, [a_ptr6]\n" "ld1rqb z7.b, p6/z, [a_ptr7]\n" @@ -185,15 +179,7 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "mov z31.s, #0\n" "addvl c_ptr7, c_ptr7, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "sdot z31.s, z16.b, z7.b[0]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "b.ne 4b\n" "3:\n" "st1w z24.s, p7, [%[c_ptr0]]\n" @@ -339,17 +325,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1rqb z1.b, p6/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1rqb z2.b, p6/z, [a_ptr2]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1rqb z3.b, p6/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1rqb z4.b, p6/z, [a_ptr4]\n" "ld1rqb z5.b, p6/z, [a_ptr5]\n" "ld1rqb z6.b, p6/z, [a_ptr6]\n" @@ -419,21 +399,13 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "mov z31.s, #0\n" "addvl c_ptr7, c_ptr7, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" - "sdot z31.s, z16.b, z7.b[0]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "sdot z24.s, z17.b, z0.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "sdot z31.s, z16.b, z7.b[0]\n" "sdot z25.s, z17.b, z1.b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" "b.ne 4b\n" @@ -598,17 +570,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1rqb z1.b, p6/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1rqb z2.b, p6/z, [a_ptr2]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1rqb z3.b, p6/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1rqb z4.b, p6/z, [a_ptr4]\n" "ld1rqb z5.b, p6/z, [a_ptr5]\n" "ld1rqb z6.b, p6/z, [a_ptr6]\n" @@ -689,21 +655,13 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "sdot z30.s, z16.b, z6.b[0]\n" "addvl %[b_ptr0], %[b_ptr0], #3\n" "sdot z31.s, z16.b, z7.b[0]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "sdot z24.s, z17.b, z0.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "sdot z25.s, z17.b, z1.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z31.s, z17.b, z7.b[1]\n" "sdot z24.s, z18.b, z0.b[2]\n" "sdot z25.s, z18.b, z1.b[2]\n" @@ -892,17 +850,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1rqb z1.b, p6/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1rqb z2.b, p6/z, [a_ptr2]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1rqb z3.b, p6/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1rqb z4.b, p6/z, [a_ptr4]\n" "ld1rqb z5.b, p6/z, [a_ptr5]\n" "ld1rqb z6.b, p6/z, [a_ptr6]\n" @@ -993,21 +945,13 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "sdot z30.s, z16.b, z6.b[0]\n" "addvl %[b_ptr0], %[b_ptr0], #4\n" "sdot z31.s, z16.b, z7.b[0]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "sdot z24.s, z17.b, z0.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "sdot z25.s, z17.b, z1.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z31.s, z17.b, z7.b[1]\n" "sdot z24.s, z18.b, z0.b[2]\n" "sdot z25.s, z18.b, z1.b[2]\n" @@ -1221,17 +1165,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "addvl %[b_ptr0], %[b_ptr0], #5\n" "cbz %[loops], 2f\n" "mov z24.s, #0\n" @@ -1312,7 +1250,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" @@ -1350,19 +1287,12 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z16.b, z7.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" "sdot z24.s, z17.b, z0.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "sdot z25.s, z17.b, z1.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z31.s, z17.b, z7.b[1]\n" "sdot z24.s, z18.b, z0.b[2]\n" "sdot z25.s, z18.b, z1.b[2]\n" @@ -1641,17 +1571,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "addvl %[b_ptr0], %[b_ptr0], #6\n" "cbz %[loops], 2f\n" @@ -1741,7 +1665,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" @@ -1753,7 +1676,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "mov z27.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "st1w z28.s, p7, [c_ptr4]\n" "addvl c_ptr4, c_ptr4, #1\n" "mov z28.s, #0\n" @@ -1781,17 +1703,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z16.b, z7.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" "sdot z24.s, z17.b, z0.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "sdot z25.s, z17.b, z1.b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" "sdot z24.s, z18.b, z0.b[2]\n" @@ -2096,17 +2012,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "addvl %[b_ptr0], %[b_ptr0], #7\n" @@ -2205,13 +2115,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "st1w z27.s, p7, [c_ptr3]\n" "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" @@ -2245,17 +2153,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "sdot z30.s, z16.b, z6.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" "sdot z31.s, z16.b, z7.b[0]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "sdot z24.s, z17.b, z0.b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z25.s, z17.b, z1.b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" @@ -2586,17 +2488,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" @@ -2704,13 +2600,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "st1w z27.s, p7, [c_ptr3]\n" "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" @@ -2722,7 +2616,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "st1w z29.s, p7, [c_ptr5]\n" "addvl c_ptr5, c_ptr5, #1\n" "mov z29.s, #0\n" @@ -2746,15 +2639,10 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z16.b, z5.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z31.s, z16.b, z7.b[0]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z25.s, z17.b, z1.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" "sdot z29.s, z17.b, z5.b[1]\n" "sdot z30.s, z17.b, z6.b[1]\n" @@ -3111,17 +2999,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" @@ -3247,13 +3129,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "st1w z27.s, p7, [c_ptr3]\n" "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" @@ -3265,7 +3145,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "st1w z29.s, p7, [c_ptr5]\n" "addvl c_ptr5, c_ptr5, #1\n" "mov z29.s, #0\n" @@ -3289,19 +3168,14 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z16.b, z5.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "sdot z25.s, z17.b, z1.b[1]\n" "addvl %[b_ptr0], %[b_ptr0], #1\n" "sdot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" "sdot z24.s, z18.b, z0.b[2]\n" @@ -3708,17 +3582,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" @@ -3853,13 +3721,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "st1w z27.s, p7, [c_ptr3]\n" "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" @@ -3871,7 +3737,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "st1w z29.s, p7, [c_ptr5]\n" "addvl c_ptr5, c_ptr5, #1\n" "mov z29.s, #0\n" @@ -3895,17 +3760,12 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z16.b, z5.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "sdot z25.s, z17.b, z1.b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" @@ -4341,17 +4201,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" @@ -4495,13 +4349,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "st1w z27.s, p7, [c_ptr3]\n" "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" @@ -4513,7 +4365,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "st1w z29.s, p7, [c_ptr5]\n" "addvl c_ptr5, c_ptr5, #1\n" "mov z29.s, #0\n" @@ -4537,17 +4388,12 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z16.b, z5.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "sdot z25.s, z17.b, z1.b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" @@ -5010,17 +4856,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" @@ -5173,13 +5013,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "st1w z27.s, p7, [c_ptr3]\n" "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" @@ -5191,7 +5029,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "st1w z29.s, p7, [c_ptr5]\n" "addvl c_ptr5, c_ptr5, #1\n" "mov z29.s, #0\n" @@ -5215,17 +5052,12 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z16.b, z5.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "sdot z25.s, z17.b, z1.b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" @@ -5715,17 +5547,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" @@ -5895,13 +5721,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "st1w z27.s, p7, [c_ptr3]\n" "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" @@ -5913,7 +5737,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "st1w z29.s, p7, [c_ptr5]\n" "addvl c_ptr5, c_ptr5, #1\n" "mov z29.s, #0\n" @@ -5937,17 +5760,12 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z16.b, z5.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "sdot z25.s, z17.b, z1.b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" @@ -6488,17 +6306,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" @@ -6677,13 +6489,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "st1w z27.s, p7, [c_ptr3]\n" "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" @@ -6695,7 +6505,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "st1w z29.s, p7, [c_ptr5]\n" "addvl c_ptr5, c_ptr5, #1\n" "mov z29.s, #0\n" @@ -6719,17 +6528,12 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z16.b, z5.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "sdot z25.s, z17.b, z1.b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" @@ -7297,17 +7101,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" @@ -7495,13 +7293,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "st1w z27.s, p7, [c_ptr3]\n" "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" @@ -7513,7 +7309,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "st1w z29.s, p7, [c_ptr5]\n" "addvl c_ptr5, c_ptr5, #1\n" "mov z29.s, #0\n" @@ -7537,17 +7332,12 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z16.b, z5.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "sdot z25.s, z17.b, z1.b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" @@ -8143,17 +7933,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "ptrue p7.b\n" "whilelt p6.b, %[temp], %[odd_depth]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" @@ -8350,13 +8134,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "st1w z27.s, p7, [c_ptr3]\n" "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" @@ -8368,7 +8150,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "st1w z29.s, p7, [c_ptr5]\n" "addvl c_ptr5, c_ptr5, #1\n" "mov z29.s, #0\n" @@ -8392,17 +8173,12 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z16.b, z5.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "sdot z25.s, z17.b, z1.b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" @@ -8968,4 +8744,4 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B } // namespace arm_gemm -#endif // ARM_COMPUTE_ENABLE_SVE +#endif // __ARM_FEATURE_SVE |