diff options
author | Georgios Pinitas <georgios.pinitas@arm.com> | 2021-01-14 13:43:40 +0000 |
---|---|---|
committer | Georgios Pinitas <georgios.pinitas@arm.com> | 2021-01-18 12:39:30 +0000 |
commit | 33e03074c36d85de87e9032a2583b04ce8ddcd6b (patch) | |
tree | 4442ec5b5022fa5681d689e6ccf3a6423efa8a93 /src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL | |
parent | 8d5337ef18901f1b54d0c062ae7486bc5a4c6610 (diff) | |
download | ComputeLibrary-33e03074c36d85de87e9032a2583b04ce8ddcd6b.tar.gz |
Cycle estimate-based kernel selection for dot product quantized s8/u8 kernels
Resolves: COMPMID-3990
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: If840c79209940535450f4ea1cbf6b0ec646a168e
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4866
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL')
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp | 64 |
1 files changed, 19 insertions, 45 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp index 2b1448bd65..3c778bfe94 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -226,7 +226,6 @@ void sve_hybrid_s8qa_dot_4x4VL ( "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" "cmp x28, x19\n" "bne 5b\n" - "prfm pstl1keep, [x9, #0x0]\n" "tbnz %x[flags], #31, 13f\n" "add x19, %x[qp], %[b_offset]\n" "ld1rw { z1.s }, p2/Z, [x19]\n" @@ -301,9 +300,8 @@ void sve_hybrid_s8qa_dot_4x4VL ( "st1b { z16.b }, p1, [x9]\n" "addvl x9, x9, #1\n" "15:" // Height 1: Writeback done - "mov x19, #0x0\n" - "incw x19, ALL, MUL #4\n" - "subs x12, x12, x19\n" + "decw x12, ALL, MUL #4\n" + "cmp x12, XZR\n" "bgt 3b\n" "b 62f\n" "16:" // Height 2 @@ -498,17 +496,13 @@ void sve_hybrid_s8qa_dot_4x4VL ( "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" "cmp x28, x19\n" "bne 20b\n" - "prfm pstl1keep, [x9, #0x0]\n" - "prfm pstl1keep, [x25, #0x0]\n" "tbnz %x[flags], #31, 28f\n" "add x19, %x[qp], %[b_offset]\n" "ld1rw { z2.s }, p2/Z, [x19]\n" "neg z2.s, p2/M, z2.s\n" - "mov x20, #0x4\n" "mov x19, #0x4\n" - "whilelt p0.s, XZR, x20\n" - "saddv d11, p0, z11.s\n" "whilelt p0.s, XZR, x19\n" + "saddv d11, p0, z11.s\n" "saddv d12, p0, z12.s\n" "mov z11.s, z11.s[0]\n" "mov z12.s, z12.s[0]\n" @@ -624,9 +618,8 @@ void sve_hybrid_s8qa_dot_4x4VL ( "st1b { z20.b }, p1, [x25]\n" "addvl x25, x25, #1\n" "30:" // Height 2: Writeback done - "mov x19, #0x0\n" - "incw x19, ALL, MUL #4\n" - "subs x12, x12, x19\n" + "decw x12, ALL, MUL #4\n" + "cmp x12, XZR\n" "bgt 18b\n" "b 62f\n" "31:" // Height 3 @@ -871,27 +864,20 @@ void sve_hybrid_s8qa_dot_4x4VL ( "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" "cmp x28, x19\n" "bne 35b\n" - "prfm pstl1keep, [x9, #0x0]\n" - "prfm pstl1keep, [x25, #0x0]\n" - "prfm pstl1keep, [x23, #0x0]\n" "tbnz %x[flags], #31, 43f\n" "add x19, %x[qp], %[b_offset]\n" "ld1rw { z3.s }, p2/Z, [x19]\n" "neg z3.s, p2/M, z3.s\n" - "mov x20, #0x4\n" "mov x19, #0x4\n" - "whilelt p0.s, XZR, x20\n" - "saddv d11, p0, z11.s\n" "whilelt p0.s, XZR, x19\n" + "saddv d11, p0, z11.s\n" "saddv d12, p0, z12.s\n" - "mov x19, #0x4\n" + "saddv d13, p0, z13.s\n" "mov z11.s, z11.s[0]\n" - "whilelt p0.s, XZR, x19\n" "mov z12.s, z12.s[0]\n" - "saddv d13, p0, z13.s\n" + "mov z13.s, z13.s[0]\n" "mul z11.s, p2/M, z11.s, z3.s\n" "mul z12.s, p2/M, z12.s, z3.s\n" - "mov z13.s, z13.s[0]\n" "mul z13.s, p2/M, z13.s, z3.s\n" "43:" // Height 3: skip row sum fixup "add z16.s, z16.s, z11.s\n" @@ -1048,9 +1034,8 @@ void sve_hybrid_s8qa_dot_4x4VL ( "st1b { z24.b }, p1, [x23]\n" "addvl x23, x23, #1\n" "45:" // Height 3: Writeback done - "mov x19, #0x0\n" - "incw x19, ALL, MUL #4\n" - "subs x12, x12, x19\n" + "decw x12, ALL, MUL #4\n" + "cmp x12, XZR\n" "bgt 33b\n" "b 62f\n" "46:" // Height 4 @@ -1347,33 +1332,23 @@ void sve_hybrid_s8qa_dot_4x4VL ( "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" "cmp x28, x19\n" "bne 50b\n" - "prfm pstl1keep, [x9, #0x0]\n" - "prfm pstl1keep, [x25, #0x0]\n" - "prfm pstl1keep, [x23, #0x0]\n" - "prfm pstl1keep, [x21, #0x0]\n" "tbnz %x[flags], #31, 58f\n" "add x19, %x[qp], %[b_offset]\n" "ld1rw { z4.s }, p2/Z, [x19]\n" "neg z4.s, p2/M, z4.s\n" - "mov x20, #0x4\n" "mov x19, #0x4\n" - "whilelt p0.s, XZR, x20\n" - "saddv d11, p0, z11.s\n" "whilelt p0.s, XZR, x19\n" + "saddv d11, p0, z11.s\n" "saddv d12, p0, z12.s\n" - "mov x19, #0x4\n" + "saddv d13, p0, z13.s\n" + "saddv d14, p0, z14.s\n" "mov z11.s, z11.s[0]\n" - "whilelt p0.s, XZR, x19\n" - "mov x19, #0x4\n" "mov z12.s, z12.s[0]\n" - "saddv d13, p0, z13.s\n" - "whilelt p0.s, XZR, x19\n" + "mov z13.s, z13.s[0]\n" + "mov z14.s, z14.s[0]\n" "mul z11.s, p2/M, z11.s, z4.s\n" - "saddv d14, p0, z14.s\n" "mul z12.s, p2/M, z12.s, z4.s\n" - "mov z13.s, z13.s[0]\n" "mul z13.s, p2/M, z13.s, z4.s\n" - "mov z14.s, z14.s[0]\n" "mul z14.s, p2/M, z14.s, z4.s\n" "58:" // Height 4: skip row sum fixup "add z16.s, z16.s, z11.s\n" @@ -1575,9 +1550,8 @@ void sve_hybrid_s8qa_dot_4x4VL ( "st1b { z28.b }, p1, [x21]\n" "addvl x21, x21, #1\n" "60:" // Height 4: Writeback done - "mov x19, #0x0\n" - "incw x19, ALL, MUL #4\n" - "subs x12, x12, x19\n" + "decw x12, ALL, MUL #4\n" + "cmp x12, XZR\n" "bgt 48b\n" "subs %x[M], %x[M], #0x4\n" "beq 62f\n" @@ -1592,7 +1566,7 @@ void sve_hybrid_s8qa_dot_4x4VL ( "b 1b\n" "62:" // Exit - : [M] "+r" (M), [flags] "+r" (flags), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); |