about | summary | refs | log | tree | commit | diff
path: root/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL
diff options
context:
space:
mode:
author Georgios Pinitas <georgios.pinitas@arm.com> 2021-01-14 13:43:40 +0000
committer Georgios Pinitas <georgios.pinitas@arm.com> 2021-01-18 12:39:30 +0000
commit 33e03074c36d85de87e9032a2583b04ce8ddcd6b (patch)
tree 4442ec5b5022fa5681d689e6ccf3a6423efa8a93 /src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL
parent 8d5337ef18901f1b54d0c062ae7486bc5a4c6610 (diff)
download ComputeLibrary-33e03074c36d85de87e9032a2583b04ce8ddcd6b.tar.gz
Cycle estimate-based kernel selection for dot product quantized s8/u8 kernels
Resolves: COMPMID-3990
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: If840c79209940535450f4ea1cbf6b0ec646a168e
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4866
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL')
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp 62
1 files changed, 18 insertions, 44 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
index 0a6546b78a..52210dca27 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
@@ -226,7 +226,6 @@ void sve_hybrid_u8qa_dot_4x4VL (
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
"cmp x28, x19\n"
"bne 5b\n"
- "prfm pstl1keep, [x9, #0x0]\n"
"tbnz %x[flags], #31, 13f\n"
"add x19, %x[qp], %[b_offset]\n"
"ld1rw { z1.s }, p2/Z, [x19]\n"
@@ -301,9 +300,8 @@ void sve_hybrid_u8qa_dot_4x4VL (
"st1b { z16.b }, p1, [x9]\n"
"addvl x9, x9, #1\n"
"15:" // Height 1: Writeback done
- "mov x19, #0x0\n"
- "incw x19, ALL, MUL #4\n"
- "subs x12, x12, x19\n"
+ "decw x12, ALL, MUL #4\n"
+ "cmp x12, XZR\n"
"bgt 3b\n"
"b 62f\n"
"16:" // Height 2
@@ -498,17 +496,13 @@ void sve_hybrid_u8qa_dot_4x4VL (
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
"cmp x28, x19\n"
"bne 20b\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
"tbnz %x[flags], #31, 28f\n"
"add x19, %x[qp], %[b_offset]\n"
"ld1rw { z2.s }, p2/Z, [x19]\n"
"neg z2.s, p2/M, z2.s\n"
- "mov x20, #0x4\n"
"mov x19, #0x4\n"
- "whilelt p0.s, XZR, x20\n"
- "uaddv d11, p0, z11.s\n"
"whilelt p0.s, XZR, x19\n"
+ "uaddv d11, p0, z11.s\n"
"uaddv d12, p0, z12.s\n"
"mov z11.s, z11.s[0]\n"
"mov z12.s, z12.s[0]\n"
@@ -624,9 +618,8 @@ void sve_hybrid_u8qa_dot_4x4VL (
"st1b { z20.b }, p1, [x25]\n"
"addvl x25, x25, #1\n"
"30:" // Height 2: Writeback done
- "mov x19, #0x0\n"
- "incw x19, ALL, MUL #4\n"
- "subs x12, x12, x19\n"
+ "decw x12, ALL, MUL #4\n"
+ "cmp x12, XZR\n"
"bgt 18b\n"
"b 62f\n"
"31:" // Height 3
@@ -871,27 +864,20 @@ void sve_hybrid_u8qa_dot_4x4VL (
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
"cmp x28, x19\n"
"bne 35b\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "prfm pstl1keep, [x23, #0x0]\n"
"tbnz %x[flags], #31, 43f\n"
"add x19, %x[qp], %[b_offset]\n"
"ld1rw { z3.s }, p2/Z, [x19]\n"
"neg z3.s, p2/M, z3.s\n"
- "mov x20, #0x4\n"
"mov x19, #0x4\n"
- "whilelt p0.s, XZR, x20\n"
- "uaddv d11, p0, z11.s\n"
"whilelt p0.s, XZR, x19\n"
+ "uaddv d11, p0, z11.s\n"
"uaddv d12, p0, z12.s\n"
- "mov x19, #0x4\n"
+ "uaddv d13, p0, z13.s\n"
"mov z11.s, z11.s[0]\n"
- "whilelt p0.s, XZR, x19\n"
"mov z12.s, z12.s[0]\n"
- "uaddv d13, p0, z13.s\n"
+ "mov z13.s, z13.s[0]\n"
"mul z11.s, p2/M, z11.s, z3.s\n"
"mul z12.s, p2/M, z12.s, z3.s\n"
- "mov z13.s, z13.s[0]\n"
"mul z13.s, p2/M, z13.s, z3.s\n"
"43:" // Height 3: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
@@ -1048,9 +1034,8 @@ void sve_hybrid_u8qa_dot_4x4VL (
"st1b { z24.b }, p1, [x23]\n"
"addvl x23, x23, #1\n"
"45:" // Height 3: Writeback done
- "mov x19, #0x0\n"
- "incw x19, ALL, MUL #4\n"
- "subs x12, x12, x19\n"
+ "decw x12, ALL, MUL #4\n"
+ "cmp x12, XZR\n"
"bgt 33b\n"
"b 62f\n"
"46:" // Height 4
@@ -1347,33 +1332,23 @@ void sve_hybrid_u8qa_dot_4x4VL (
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
"cmp x28, x19\n"
"bne 50b\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "prfm pstl1keep, [x21, #0x0]\n"
"tbnz %x[flags], #31, 58f\n"
"add x19, %x[qp], %[b_offset]\n"
"ld1rw { z4.s }, p2/Z, [x19]\n"
"neg z4.s, p2/M, z4.s\n"
- "mov x20, #0x4\n"
"mov x19, #0x4\n"
- "whilelt p0.s, XZR, x20\n"
- "uaddv d11, p0, z11.s\n"
"whilelt p0.s, XZR, x19\n"
+ "uaddv d11, p0, z11.s\n"
"uaddv d12, p0, z12.s\n"
- "mov x19, #0x4\n"
+ "uaddv d13, p0, z13.s\n"
+ "uaddv d14, p0, z14.s\n"
"mov z11.s, z11.s[0]\n"
- "whilelt p0.s, XZR, x19\n"
- "mov x19, #0x4\n"
"mov z12.s, z12.s[0]\n"
- "uaddv d13, p0, z13.s\n"
- "whilelt p0.s, XZR, x19\n"
+ "mov z13.s, z13.s[0]\n"
+ "mov z14.s, z14.s[0]\n"
"mul z11.s, p2/M, z11.s, z4.s\n"
- "uaddv d14, p0, z14.s\n"
"mul z12.s, p2/M, z12.s, z4.s\n"
- "mov z13.s, z13.s[0]\n"
"mul z13.s, p2/M, z13.s, z4.s\n"
- "mov z14.s, z14.s[0]\n"
"mul z14.s, p2/M, z14.s, z4.s\n"
"58:" // Height 4: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
@@ -1575,9 +1550,8 @@ void sve_hybrid_u8qa_dot_4x4VL (
"st1b { z28.b }, p1, [x21]\n"
"addvl x21, x21, #1\n"
"60:" // Height 4: Writeback done
- "mov x19, #0x0\n"
- "incw x19, ALL, MUL #4\n"
- "subs x12, x12, x19\n"
+ "decw x12, ALL, MUL #4\n"
+ "cmp x12, XZR\n"
"bgt 48b\n"
"subs %x[M], %x[M], #0x4\n"
"beq 62f\n"
@@ -1592,7 +1566,7 @@ void sve_hybrid_u8qa_dot_4x4VL (
"b 1b\n"
"62:" // Exit
- : [M] "+r" (M), [flags] "+r" (flags), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);