about | summary | refs | log | tree | commit | diff
path: root/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp')
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp 149
1 files changed, 43 insertions, 106 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
index 413bc65288..fc8bdb50a9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -137,13 +137,12 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "cmp x26, #0x10\n"
+ "add x25, x25, #0x10\n"
"udot z10.s, z6.b, z0.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"udot z11.s, z7.b, z0.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n"
"udot z8.s, z6.b, z0.b[1]\n"
@@ -178,7 +177,6 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
@@ -217,9 +215,8 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z10.s, z6.b, z0.b[3]\n"
"udot z11.s, z7.b, z0.b[3]\n"
"10:" // Height 1: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 5b\n"
"st1w { z8.s }, p4, [x28]\n"
@@ -296,16 +293,14 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
"udot z12.s, z6.b, z1.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "cmp x26, #0x10\n"
"udot z13.s, z7.b, z1.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"udot z10.s, z6.b, z0.b[0]\n"
"udot z14.s, z6.b, z1.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
@@ -356,9 +351,7 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
- "add x24, x24, #0x10\n"
"udot z12.s, z6.b, z1.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"udot z13.s, z7.b, z1.b[0]\n"
@@ -413,10 +406,8 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z11.s, z7.b, z0.b[3]\n"
"udot z15.s, z7.b, z1.b[3]\n"
"21:" // Height 2: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 16b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -511,21 +502,18 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"udot z12.s, z6.b, z1.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "udot z13.s, z7.b, z1.b[0]\n"
"add x23, x23, #0x10\n"
"udot z16.s, z6.b, z2.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "cmp x26, #0x10\n"
- "udot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"udot z17.s, z7.b, z2.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"udot z10.s, z6.b, z0.b[0]\n"
"udot z14.s, z6.b, z1.b[0]\n"
"udot z18.s, z6.b, z2.b[0]\n"
@@ -590,12 +578,9 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
"udot z12.s, z6.b, z1.b[0]\n"
- "add x23, x23, #0x10\n"
"udot z13.s, z7.b, z1.b[0]\n"
"udot z16.s, z6.b, z2.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
@@ -665,11 +650,8 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z15.s, z7.b, z1.b[3]\n"
"udot z19.s, z7.b, z2.b[3]\n"
"32:" // Height 3: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 27b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -781,26 +763,22 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"udot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
"udot z16.s, z6.b, z2.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
"udot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x10\n"
+ "add x22, x22, #0x10\n"
+ "udot z17.s, z7.b, z2.b[0]\n"
"udot z20.s, z6.b, z3.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"udot z21.s, z7.b, z3.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"udot z10.s, z6.b, z0.b[0]\n"
"udot z14.s, z6.b, z1.b[0]\n"
"udot z18.s, z6.b, z2.b[0]\n"
@@ -879,19 +857,15 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "udot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "udot z16.s, z6.b, z2.b[0]\n"
- "add x22, x22, #0x10\n"
+ "udot z12.s, z6.b, z1.b[0]\n"
"udot z13.s, z7.b, z1.b[0]\n"
- "udot z17.s, z7.b, z2.b[0]\n"
+ "udot z16.s, z6.b, z2.b[0]\n"
"udot z20.s, z6.b, z3.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
+ "udot z17.s, z7.b, z2.b[0]\n"
"udot z21.s, z7.b, z3.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
@@ -972,12 +946,8 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z19.s, z7.b, z2.b[3]\n"
"udot z23.s, z7.b, z3.b[3]\n"
"43:" // Height 4: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 38b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -1106,32 +1076,27 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"udot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
"udot z16.s, z6.b, z2.b[0]\n"
"ld1rqb { z4.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
"udot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "udot z17.s, z7.b, z2.b[0]\n"
"add x21, x21, #0x10\n"
"udot z20.s, z6.b, z3.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x10\n"
"udot z24.s, z6.b, z4.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"udot z21.s, z7.b, z3.b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"udot z25.s, z7.b, z4.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
"udot z10.s, z6.b, z0.b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"udot z14.s, z6.b, z1.b[0]\n"
"udot z18.s, z6.b, z2.b[0]\n"
"udot z22.s, z6.b, z3.b[0]\n"
@@ -1223,22 +1188,17 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "udot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "udot z16.s, z6.b, z2.b[0]\n"
+ "udot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z4.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
"udot z13.s, z7.b, z1.b[0]\n"
- "add x21, x21, #0x10\n"
- "udot z17.s, z7.b, z2.b[0]\n"
+ "udot z16.s, z6.b, z2.b[0]\n"
"udot z20.s, z6.b, z3.b[0]\n"
"udot z24.s, z6.b, z4.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
+ "udot z17.s, z7.b, z2.b[0]\n"
"udot z21.s, z7.b, z3.b[0]\n"
"udot z25.s, z7.b, z4.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
@@ -1334,13 +1294,8 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z23.s, z7.b, z3.b[3]\n"
"udot z27.s, z7.b, z4.b[3]\n"
"54:" // Height 5: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 49b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -1489,37 +1444,31 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"udot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
"udot z16.s, z6.b, z2.b[0]\n"
"ld1rqb { z4.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
"udot z13.s, z7.b, z1.b[0]\n"
"ld1rqb { z5.b }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
+ "add x22, x22, #0x10\n"
"udot z20.s, z6.b, z3.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x21, x21, #0x10\n"
+ "udot z17.s, z7.b, z2.b[0]\n"
"add x20, x20, #0x10\n"
"udot z24.s, z6.b, z4.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x10\n"
"udot z28.s, z6.b, z5.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"udot z21.s, z7.b, z3.b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"udot z25.s, z7.b, z4.b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"udot z29.s, z7.b, z5.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
"udot z10.s, z6.b, z0.b[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"udot z14.s, z6.b, z1.b[0]\n"
"udot z18.s, z6.b, z2.b[0]\n"
"udot z22.s, z6.b, z3.b[0]\n"
@@ -1625,25 +1574,19 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "udot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "udot z16.s, z6.b, z2.b[0]\n"
+ "udot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z4.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
"udot z13.s, z7.b, z1.b[0]\n"
"ld1rqb { z5.b }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
+ "udot z16.s, z6.b, z2.b[0]\n"
"udot z20.s, z6.b, z3.b[0]\n"
- "add x20, x20, #0x10\n"
- "udot z17.s, z7.b, z2.b[0]\n"
"udot z24.s, z6.b, z4.b[0]\n"
"udot z28.s, z6.b, z5.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
+ "udot z17.s, z7.b, z2.b[0]\n"
"udot z21.s, z7.b, z3.b[0]\n"
"udot z25.s, z7.b, z4.b[0]\n"
"udot z29.s, z7.b, z5.b[0]\n"
@@ -1754,14 +1697,8 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z27.s, z7.b, z4.b[3]\n"
"udot z31.s, z7.b, z5.b[3]\n"
"65:" // Height 6: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 60b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"