From e28cf395b57a091d0850cd28cecc81046153b843 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Sun, 31 Jan 2021 05:18:43 +0000 Subject: Regenerate kernels and update A55 versions Signed-off-by: Georgios Pinitas Change-Id: I9eae76c77db03b8806af65729da34ab2d77f95f2 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4965 Tested-by: Arm Jenkins Reviewed-by: Michele Di Giorgio Comments-Addressed: Arm Jenkins --- Android.bp | 8 + .../a32_interleave6_block1_fp32_fp32.hpp | 3 - .../a64_interleave4_block16_s8_s8.hpp | 14 +- .../a64_interleave4_block16_s8_s8_summing.hpp | 24 +- .../a64_interleave4_block16_u8_u8_summing.hpp | 24 +- .../a64_interleave8_block1_bf16_fp32.hpp | 156 +- .../a64_interleave8_block1_fp16_fp16.hpp | 170 +- .../a64_interleave8_block1_fp16_fp32.hpp | 176 +- .../a64_interleave8_block1_fp32_fp32.hpp | 144 +- .../a64_interleave8_block1_s16_s16.hpp | 170 +- .../a64_interleave8_block1_s16_s16_summing.hpp | 200 +- .../a64_interleave8_block1_s8_s16.hpp | 258 +- .../a64_interleave8_block1_s8_s16_summing.hpp | 240 +- .../a64_interleave8_block1_u16_u16_summing.hpp | 200 +- .../a64_interleave8_block1_u8_u16.hpp | 258 +- .../a64_interleave8_block1_u8_u16_summing.hpp | 240 +- .../a64_interleave8_block2_bf16_bf16.hpp | 202 +- .../a64_interleave8_block2_fp32_fp32.hpp | 74 +- .../a64_interleave8_block4_bf16_bf16.hpp | 92 +- .../a64_interleave8_block4_s8_s8.hpp | 302 +- .../a64_interleave8_block4_s8_s8_summing.hpp | 322 +- .../a64_interleave8_block4_u8_u8_summing.hpp | 322 +- .../a64_interleave8_block8_s8_s8.hpp | 128 +- .../a64_interleave8_block8_s8_s8_summing.hpp | 68 +- .../a64_interleave8_block8_u8_u8_summing.hpp | 68 +- .../kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp | 7 +- .../kernels/a64_hybrid_bf16fp32_dot_6x16.hpp | 2 - .../a64_hybrid_bf16fp32_dot_6x16/generic.cpp | 4364 +++++++------ .../arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp | 17 +- .../kernels/a64_hybrid_fp16_mla_6x32/a55.cpp | 5757 +++++++++++++++++ 
.../kernels/a64_hybrid_fp16_mla_6x32/generic.cpp | 6689 ++++++++++---------- .../arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp | 18 +- .../kernels/a64_hybrid_fp32_mla_6x16/a55.cpp | 3595 +++++++++++ .../kernels/a64_hybrid_fp32_mla_6x16/generic.cpp | 4192 ++++++------ .../arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp | 13 +- .../kernels/a64_hybrid_fp32_mla_8x4/a55.cpp | 2215 +++++++ .../kernels/a64_hybrid_fp32_mla_8x4/generic.cpp | 3175 +++++----- .../arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp | 15 +- .../kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp | 2148 +++++++ .../kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp | 2336 ++++--- .../arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp | 17 +- .../kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp | 3772 +++++++++++ .../kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp | 3289 +++++----- .../arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp | 17 +- .../kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp | 3499 ++++++++++ .../kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp | 4440 +++++++------ .../arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp | 15 +- .../kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp | 2148 +++++++ .../kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp | 2336 ++++--- .../arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp | 17 +- .../kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp | 3499 ++++++++++ .../kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp | 4440 +++++++------ .../kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp | 2 - .../sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp | 1976 +++--- .../arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp | 2 - .../kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp | 2516 ++++---- .../arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp | 2 - .../kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp | 1976 +++--- .../arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp | 2 - .../kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp | 2308 ++++--- .../arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp | 2 - .../kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp | 1338 ++-- 
.../arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp | 2 - .../kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp | 1903 +++--- .../kernels/sve_hybrid_s8s32_dot_6x4VL.hpp | 2 - .../kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp | 1835 +++--- .../arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp | 2 - .../kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp | 1340 ++-- .../kernels/sve_hybrid_u8u32_dot_6x4VL.hpp | 2 - .../kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp | 1835 +++--- .../arm_gemm/merges/a32_merge_float_8x6.hpp | 3 - 71 files changed, 54231 insertions(+), 28712 deletions(-) create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp diff --git a/Android.bp b/Android.bp index d3a5b0b924..dc6c702011 100644 --- a/Android.bp +++ b/Android.bp @@ -889,13 +889,21 @@ cc_library_static { "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp", + 
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp", diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a32_interleave6_block1_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a32_interleave6_block1_fp32_fp32.hpp index 074299997d..807511f0d2 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a32_interleave6_block1_fp32_fp32.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a32_interleave6_block1_fp32_fp32.hpp @@ -22,9 +22,6 @@ * SOFTWARE. 
*/ #pragma once -#if (defined(__GNUC__) && (__GNUC__ >= 7)) -#pragma GCC diagnostic ignored "-Wimplicit-fallthrough" -#endif #ifdef __arm__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp index 8054c2b96b..6a8caf6ce6 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -58,19 +58,19 @@ void interleave_block<4, 16, VLType::None, false>( "blt 3f\n" "2:" // Main loop head "ldr q19, [x22], #0x10\n" - "prfm pldl1keep, [x22, #0x70]\n" + "subs %x[width], %x[width], #0x10\n" "ldr q18, [x21], #0x10\n" + "cmp %x[width], #0x10\n" "ldr q17, [x20], #0x10\n" - "prfm pldl1keep, [x21, #0x70]\n" "ldr q16, [x19], #0x10\n" + "prfm pldl1keep, [x22, #0x70]\n" + "prfm pldl1keep, [x21, #0x70]\n" "prfm pldl1keep, [x20, #0x70]\n" + "prfm pldl1keep, [x19, #0x70]\n" "str q19, [%x[out_ptr], #0x0]\n" "str q18, [%x[out_ptr], #0x10]\n" - "prfm pldl1keep, [x19, #0x70]\n" "str q17, [%x[out_ptr], #0x20]\n" "str q16, [%x[out_ptr], #0x30]\n" - "subs %x[width], %x[width], #0x10\n" - "cmp %x[width], #0x10\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "bge 2b\n" "3:" // Main loop skip @@ -171,7 +171,7 @@ void interleave_block<4, 16, VLType::None, false>( "add %x[out_ptr], %x[out_ptr], #0x40\n" "12:" // Odds skip - : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) : "cc", "memory", "v16", "v17", "v18", "v19", "x19", "x20", "x21", "x22" ); diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp 
b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp index 1650916f9f..954a86656e 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -84,24 +84,24 @@ void interleave_block<4, 16, VLType::None, true>( "mov x22, #0x0\n" "4:" // no_accumulate_16 "ldr q19, [x23], #0x10\n" - "prfm pldl1keep, [x23, #0x70]\n" + "add x22, x22, #0x1\n" "ldr q18, [x21], #0x10\n" + "subs %x[width], %x[width], #0x10\n" "ldr q17, [x20], #0x10\n" - "prfm pldl1keep, [x21, #0x70]\n" + "cmp %x[width], #0x10\n" "ldr q16, [x19], #0x10\n" - "prfm pldl1keep, [x20, #0x70]\n" - "str q19, [%x[out_ptr], #0x0]\n" "sadalp v28.8h, v19.16b\n" + "prfm pldl1keep, [x23, #0x70]\n" + "prfm pldl1keep, [x21, #0x70]\n" + "sadalp v27.8h, v18.16b\n" + "prfm pldl1keep, [x20, #0x70]\n" + "sadalp v26.8h, v17.16b\n" "prfm pldl1keep, [x19, #0x70]\n" + "sadalp v25.8h, v16.16b\n" + "str q19, [%x[out_ptr], #0x0]\n" "str q18, [%x[out_ptr], #0x10]\n" - "sadalp v27.8h, v18.16b\n" "str q17, [%x[out_ptr], #0x20]\n" - "sadalp v26.8h, v17.16b\n" "str q16, [%x[out_ptr], #0x30]\n" - "sadalp v25.8h, v16.16b\n" - "add x22, x22, #0x1\n" - "subs %x[width], %x[width], #0x10\n" - "cmp %x[width], #0x10\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "bge 3b\n" "5:" // Main loop skip @@ -215,7 +215,7 @@ void interleave_block<4, 16, VLType::None, true>( "add v24.4s, v24.4s, v20.4s\n" "str q24, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" - : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", 
"v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23" ); diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp index af3efb25b2..c81146212c 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -84,24 +84,24 @@ void interleave_block<4, 16, VLType::None, true>( "mov x22, #0x0\n" "4:" // no_accumulate_16 "ldr q19, [x23], #0x10\n" - "prfm pldl1keep, [x23, #0x70]\n" + "add x22, x22, #0x1\n" "ldr q18, [x21], #0x10\n" + "subs %x[width], %x[width], #0x10\n" "ldr q17, [x20], #0x10\n" - "prfm pldl1keep, [x21, #0x70]\n" + "cmp %x[width], #0x10\n" "ldr q16, [x19], #0x10\n" - "prfm pldl1keep, [x20, #0x70]\n" - "str q19, [%x[out_ptr], #0x0]\n" "uadalp v28.8h, v19.16b\n" + "prfm pldl1keep, [x23, #0x70]\n" + "prfm pldl1keep, [x21, #0x70]\n" + "uadalp v27.8h, v18.16b\n" + "prfm pldl1keep, [x20, #0x70]\n" + "uadalp v26.8h, v17.16b\n" "prfm pldl1keep, [x19, #0x70]\n" + "uadalp v25.8h, v16.16b\n" + "str q19, [%x[out_ptr], #0x0]\n" "str q18, [%x[out_ptr], #0x10]\n" - "uadalp v27.8h, v18.16b\n" "str q17, [%x[out_ptr], #0x20]\n" - "uadalp v26.8h, v17.16b\n" "str q16, [%x[out_ptr], #0x30]\n" - "uadalp v25.8h, v16.16b\n" - "add x22, x22, #0x1\n" - "subs %x[width], %x[width], #0x10\n" - "cmp %x[width], #0x10\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "bge 3b\n" "5:" // Main loop skip @@ -215,7 +215,7 @@ void interleave_block<4, 16, VLType::None, true>( "add v24.4s, v24.4s, v20.4s\n" "str q24, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" - : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [out_ptr] "+&r" (out_ptr), [width] 
"+&r" (width) : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23" ); diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp index 34d25f27b8..42574295f1 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -31,7 +31,7 @@ void interleave_block<8, 1, VLType::None, false>( ) { __asm__ __volatile__( - "movi v29.8h, #0x0\n" + "movi v30.8h, #0x0\n" "ldr x27, [%x[in], #0x0]\n" "cmp %x[height], #0x8\n" "ldr x26, [%x[in], #0x8]\n" @@ -80,53 +80,53 @@ void interleave_block<8, 1, VLType::None, false>( "prfm pldl1keep, [x20, #0x40]\n" "blt 3f\n" "2:" // Main loop head - "ldr d28, [x27], #0x8\n" - "zip1 v28.8h, v29.8h, v28.8h\n" + "ldr d29, [x27], #0x8\n" + "zip1 v29.8h, v30.8h, v29.8h\n" + "ldr d28, [x26], #0x8\n" + "subs %x[width], %x[width], #0x4\n" + "zip1 v28.8h, v30.8h, v28.8h\n" + "ldr d24, [x25], #0x8\n" + "cmp %x[width], #0x4\n" + "zip1 v24.8h, v30.8h, v24.8h\n" + "ldr d27, [x24], #0x8\n" + "ldr d26, [x23], #0x8\n" + "zip1 v25.4s, v29.4s, v24.4s\n" + "zip2 v24.4s, v29.4s, v24.4s\n" + "ldr d23, [x22], #0x8\n" + "ldr d22, [x21], #0x8\n" + "zip1 v27.8h, v30.8h, v27.8h\n" + "ldr d21, [x20], #0x8\n" + "zip1 v26.8h, v30.8h, v26.8h\n" "prfm pldl1keep, [x27, #0x70]\n" - "ldr d27, [x26], #0x8\n" - "zip1 v27.8h, v29.8h, v27.8h\n" + "zip1 v20.4s, v28.4s, v27.4s\n" "prfm pldl1keep, [x26, #0x70]\n" - "ldr d26, [x25], #0x8\n" - "zip1 v26.8h, v29.8h, v26.8h\n" + "zip1 v23.8h, v30.8h, 
v23.8h\n" "prfm pldl1keep, [x25, #0x70]\n" - "ldr d25, [x24], #0x8\n" - "zip1 v20.4s, v28.4s, v26.4s\n" + "zip1 v22.8h, v30.8h, v22.8h\n" "prfm pldl1keep, [x24, #0x70]\n" - "zip1 v25.8h, v29.8h, v25.8h\n" - "ldr d24, [x23], #0x8\n" - "zip1 v19.4s, v27.4s, v25.4s\n" + "zip1 v21.8h, v30.8h, v21.8h\n" "prfm pldl1keep, [x23, #0x70]\n" - "zip1 v24.8h, v29.8h, v24.8h\n" - "ldr d23, [x22], #0x8\n" - "zip1 v16.4s, v20.4s, v19.4s\n" + "zip1 v17.4s, v25.4s, v20.4s\n" "prfm pldl1keep, [x22, #0x70]\n" - "zip1 v23.8h, v29.8h, v23.8h\n" - "ldr d22, [x21], #0x8\n" - "zip2 v19.4s, v20.4s, v19.4s\n" + "zip1 v19.4s, v26.4s, v22.4s\n" "prfm pldl1keep, [x21, #0x70]\n" - "zip1 v22.8h, v29.8h, v22.8h\n" - "ldr d21, [x20], #0x8\n" - "zip1 v18.4s, v24.4s, v22.4s\n" + "zip1 v18.4s, v23.4s, v21.4s\n" "prfm pldl1keep, [x20, #0x70]\n" - "zip1 v21.8h, v29.8h, v21.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" - "zip1 v17.4s, v23.4s, v21.4s\n" - "subs %x[width], %x[width], #0x4\n" - "zip2 v20.4s, v28.4s, v26.4s\n" - "cmp %x[width], #0x4\n" - "zip1 v16.4s, v18.4s, v17.4s\n" + "zip1 v16.4s, v19.4s, v18.4s\n" + "str q17, [%x[out_ptr], #0x0]\n" + "zip2 v17.4s, v25.4s, v20.4s\n" "str q16, [%x[out_ptr], #0x10]\n" - "zip2 v16.4s, v18.4s, v17.4s\n" - "str q19, [%x[out_ptr], #0x20]\n" - "zip2 v19.4s, v27.4s, v25.4s\n" + "zip2 v16.4s, v19.4s, v18.4s\n" + "str q17, [%x[out_ptr], #0x20]\n" + "zip2 v19.4s, v28.4s, v27.4s\n" "str q16, [%x[out_ptr], #0x30]\n" - "zip1 v16.4s, v20.4s, v19.4s\n" + "zip1 v16.4s, v24.4s, v19.4s\n" "str q16, [%x[out_ptr], #0x40]\n" - "zip2 v18.4s, v24.4s, v22.4s\n" + "zip2 v18.4s, v26.4s, v22.4s\n" "zip2 v17.4s, v23.4s, v21.4s\n" "zip1 v16.4s, v18.4s, v17.4s\n" "str q16, [%x[out_ptr], #0x50]\n" - "zip2 v16.4s, v20.4s, v19.4s\n" + "zip2 v16.4s, v24.4s, v19.4s\n" "str q16, [%x[out_ptr], #0x60]\n" "zip2 v16.4s, v18.4s, v17.4s\n" "str q16, [%x[out_ptr], #0x70]\n" @@ -135,77 +135,77 @@ void interleave_block<8, 1, VLType::None, false>( "3:" // Main loop skip "cbz %x[width], 6f\n" "tbz 
%x[width], #1, 4f\n" - "ldr s28, [x27], #0x4\n" - "ldr s27, [x26], #0x4\n" - "ldr s26, [x25], #0x4\n" - "ldr s25, [x24], #0x4\n" - "ldr s24, [x23], #0x4\n" + "ldr s29, [x27], #0x4\n" + "ldr s28, [x26], #0x4\n" + "mov x19, #0x2\n" + "ldr s24, [x25], #0x4\n" + "ldr s27, [x24], #0x4\n" + "ldr s26, [x23], #0x4\n" "ldr s23, [x22], #0x4\n" "ldr s22, [x21], #0x4\n" "ldr s21, [x20], #0x4\n" - "mov x19, #0x2\n" "tbz %x[width], #0, 5f\n" - "ld1 { v28.h }[2], [x27]\n" - "ld1 { v27.h }[2], [x26]\n" - "ld1 { v26.h }[2], [x25]\n" - "ld1 { v25.h }[2], [x24]\n" - "ld1 { v24.h }[2], [x23]\n" + "ld1 { v29.h }[2], [x27]\n" + "mov x19, #0x3\n" + "ld1 { v28.h }[2], [x26]\n" + "ld1 { v24.h }[2], [x25]\n" + "ld1 { v27.h }[2], [x24]\n" + "ld1 { v26.h }[2], [x23]\n" "ld1 { v23.h }[2], [x22]\n" "ld1 { v22.h }[2], [x21]\n" "ld1 { v21.h }[2], [x20]\n" - "mov x19, #0x3\n" "b 5f\n" "4:" // odd_loads_1_0 - "ldr h28, [x27, #0x0]\n" - "ldr h27, [x26, #0x0]\n" - "ldr h26, [x25, #0x0]\n" - "ldr h25, [x24, #0x0]\n" - "ldr h24, [x23, #0x0]\n" + "ldr h29, [x27, #0x0]\n" + "mov x19, #0x1\n" + "ldr h28, [x26, #0x0]\n" + "ldr h24, [x25, #0x0]\n" + "ldr h27, [x24, #0x0]\n" + "ldr h26, [x23, #0x0]\n" "ldr h23, [x22, #0x0]\n" "ldr h22, [x21, #0x0]\n" "ldr h21, [x20, #0x0]\n" - "mov x19, #0x1\n" "5:" // Odd load end - "zip1 v28.8h, v29.8h, v28.8h\n" + "zip1 v29.8h, v30.8h, v29.8h\n" "subs x19, x19, #0x1\n" - "zip1 v27.8h, v29.8h, v27.8h\n" - "zip1 v26.8h, v29.8h, v26.8h\n" - "zip1 v25.8h, v29.8h, v25.8h\n" - "zip1 v24.8h, v29.8h, v24.8h\n" - "zip1 v23.8h, v29.8h, v23.8h\n" - "zip1 v22.8h, v29.8h, v22.8h\n" - "zip1 v21.8h, v29.8h, v21.8h\n" - "zip1 v20.4s, v28.4s, v26.4s\n" - "zip1 v19.4s, v27.4s, v25.4s\n" - "zip1 v16.4s, v20.4s, v19.4s\n" - "str q16, [%x[out_ptr], #0x0]\n" - "zip1 v18.4s, v24.4s, v22.4s\n" - "zip1 v17.4s, v23.4s, v21.4s\n" - "zip1 v16.4s, v18.4s, v17.4s\n" + "zip1 v28.8h, v30.8h, v28.8h\n" + "zip1 v24.8h, v30.8h, v24.8h\n" + "zip1 v27.8h, v30.8h, v27.8h\n" + "zip1 v26.8h, v30.8h, v26.8h\n" + 
"zip1 v23.8h, v30.8h, v23.8h\n" + "zip1 v22.8h, v30.8h, v22.8h\n" + "zip1 v21.8h, v30.8h, v21.8h\n" + "zip1 v25.4s, v29.4s, v24.4s\n" + "zip1 v20.4s, v28.4s, v27.4s\n" + "zip1 v17.4s, v25.4s, v20.4s\n" + "str q17, [%x[out_ptr], #0x0]\n" + "zip1 v19.4s, v26.4s, v22.4s\n" + "zip1 v18.4s, v23.4s, v21.4s\n" + "zip1 v16.4s, v19.4s, v18.4s\n" "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 6f\n" - "zip2 v19.4s, v20.4s, v19.4s\n" - "zip2 v16.4s, v18.4s, v17.4s\n" - "str q19, [%x[out_ptr], #0x0]\n" - "str q16, [%x[out_ptr], #0x10]\n" + "zip2 v17.4s, v25.4s, v20.4s\n" + "str q17, [%x[out_ptr], #0x0]\n" + "zip2 v16.4s, v19.4s, v18.4s\n" "subs x19, x19, #0x1\n" + "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 6f\n" - "zip2 v20.4s, v28.4s, v26.4s\n" - "zip2 v19.4s, v27.4s, v25.4s\n" - "zip1 v16.4s, v20.4s, v19.4s\n" + "zip2 v24.4s, v29.4s, v24.4s\n" + "zip2 v19.4s, v28.4s, v27.4s\n" + "zip1 v16.4s, v24.4s, v19.4s\n" "str q16, [%x[out_ptr], #0x0]\n" - "zip2 v18.4s, v24.4s, v22.4s\n" + "zip2 v18.4s, v26.4s, v22.4s\n" "zip2 v17.4s, v23.4s, v21.4s\n" "zip1 v16.4s, v18.4s, v17.4s\n" "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "6:" // Odds skip - : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp index 
d547957129..62d1657a9a 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -80,51 +80,51 @@ void interleave_block<8, 1, VLType::None, false>( "blt 3f\n" "2:" // Main loop head "ldr q30, [x27], #0x10\n" - "prfm pldl1keep, [x27, #0x70]\n" + "subs %x[width], %x[width], #0x8\n" "ldr q29, [x26], #0x10\n" + "cmp %x[width], #0x8\n" "ldr q28, [x25], #0x10\n" - "prfm pldl1keep, [x26, #0x70]\n" "ldr q27, [x24], #0x10\n" + "ldr q25, [x23], #0x10\n" + "zip1 v26.8h, v30.8h, v25.8h\n" + "ldr q21, [x22], #0x10\n" + "zip2 v25.8h, v30.8h, v25.8h\n" + "ldr q24, [x21], #0x10\n" + "ldr q23, [x20], #0x10\n" + "zip1 v22.8h, v29.8h, v21.8h\n" + "prfm pldl1keep, [x27, #0x70]\n" + "zip2 v21.8h, v29.8h, v21.8h\n" + "prfm pldl1keep, [x26, #0x70]\n" + "zip1 v20.8h, v28.8h, v24.8h\n" "prfm pldl1keep, [x25, #0x70]\n" - "ldr q24, [x23], #0x10\n" - "zip1 v26.8h, v30.8h, v24.8h\n" + "zip1 v18.8h, v26.8h, v20.8h\n" "prfm pldl1keep, [x24, #0x70]\n" - "ldr q25, [x22], #0x10\n" - "zip2 v24.8h, v30.8h, v24.8h\n" + "zip1 v19.8h, v27.8h, v23.8h\n" "prfm pldl1keep, [x23, #0x70]\n" - "ldr q23, [x21], #0x10\n" - "zip1 v21.8h, v29.8h, v25.8h\n" + "zip1 v16.8h, v22.8h, v19.8h\n" "prfm pldl1keep, [x22, #0x70]\n" - "ldr q22, [x20], #0x10\n" - "zip1 v18.8h, v28.8h, v23.8h\n" + "zip1 v17.8h, v18.8h, v16.8h\n" "prfm pldl1keep, [x21, #0x70]\n" - "subs %x[width], %x[width], #0x8\n" - "zip1 v20.8h, v26.8h, v18.8h\n" + "zip2 v16.8h, v18.8h, v16.8h\n" "prfm pldl1keep, [x20, #0x70]\n" - "zip1 v19.8h, v27.8h, v22.8h\n" - "cmp %x[width], #0x8\n" - "zip1 v17.8h, v21.8h, v19.8h\n" - "zip2 v18.8h, v26.8h, v18.8h\n" - "zip1 v16.8h, v20.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" - "zip2 v16.8h, v20.8h, v17.8h\n" + "zip2 v18.8h, v26.8h, 
v20.8h\n" + "str q17, [%x[out_ptr], #0x0]\n" + "zip2 v17.8h, v22.8h, v19.8h\n" "str q16, [%x[out_ptr], #0x10]\n" - "zip2 v17.8h, v21.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x20]\n" "zip2 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x30]\n" - "zip2 v21.8h, v28.8h, v23.8h\n" - "zip1 v18.8h, v24.8h, v21.8h\n" - "zip2 v20.8h, v29.8h, v25.8h\n" - "zip2 v19.8h, v27.8h, v22.8h\n" - "zip1 v17.8h, v20.8h, v19.8h\n" + "zip2 v20.8h, v28.8h, v24.8h\n" + "zip1 v18.8h, v25.8h, v20.8h\n" + "zip2 v19.8h, v27.8h, v23.8h\n" + "zip1 v17.8h, v21.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x40]\n" "zip2 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x50]\n" - "zip2 v18.8h, v24.8h, v21.8h\n" - "zip2 v17.8h, v20.8h, v19.8h\n" + "zip2 v18.8h, v25.8h, v20.8h\n" + "zip2 v17.8h, v21.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x60]\n" "zip2 v16.8h, v18.8h, v17.8h\n" @@ -138,129 +138,129 @@ void interleave_block<8, 1, VLType::None, false>( "ldr d29, [x26], #0x8\n" "ldr d28, [x25], #0x8\n" "ldr d27, [x24], #0x8\n" - "ldr d24, [x23], #0x8\n" - "ldr d25, [x22], #0x8\n" - "ldr d23, [x21], #0x8\n" - "ldr d22, [x20], #0x8\n" + "ldr d25, [x23], #0x8\n" + "ldr d21, [x22], #0x8\n" + "ldr d24, [x21], #0x8\n" + "ldr d23, [x20], #0x8\n" "tbz %x[width], #1, 4f\n" "ld1 { v30.s }[2], [x27], #0x4\n" + "mov x19, #0x6\n" "ld1 { v29.s }[2], [x26], #0x4\n" "ld1 { v28.s }[2], [x25], #0x4\n" "ld1 { v27.s }[2], [x24], #0x4\n" - "ld1 { v24.s }[2], [x23], #0x4\n" - "ld1 { v25.s }[2], [x22], #0x4\n" - "ld1 { v23.s }[2], [x21], #0x4\n" - "ld1 { v22.s }[2], [x20], #0x4\n" - "mov x19, #0x6\n" + "ld1 { v25.s }[2], [x23], #0x4\n" + "ld1 { v21.s }[2], [x22], #0x4\n" + "ld1 { v24.s }[2], [x21], #0x4\n" + "ld1 { v23.s }[2], [x20], #0x4\n" "tbz %x[width], #0, 7f\n" "ld1 { v30.h }[6], [x27]\n" + "mov x19, #0x7\n" "ld1 { v29.h }[6], [x26]\n" "ld1 { v28.h }[6], [x25]\n" "ld1 { v27.h }[6], [x24]\n" - "ld1 { v24.h }[6], [x23]\n" - 
"ld1 { v25.h }[6], [x22]\n" - "ld1 { v23.h }[6], [x21]\n" - "ld1 { v22.h }[6], [x20]\n" - "mov x19, #0x7\n" + "ld1 { v25.h }[6], [x23]\n" + "ld1 { v21.h }[6], [x22]\n" + "ld1 { v24.h }[6], [x21]\n" + "ld1 { v23.h }[6], [x20]\n" "b 7f\n" "4:" // odd_loads_1_4 "mov x19, #0x4\n" "tbz %x[width], #0, 7f\n" "ld1 { v30.h }[4], [x27]\n" "ld1 { v29.h }[4], [x26]\n" + "mov x19, #0x5\n" "ld1 { v28.h }[4], [x25]\n" "ld1 { v27.h }[4], [x24]\n" - "ld1 { v24.h }[4], [x23]\n" - "ld1 { v25.h }[4], [x22]\n" - "ld1 { v23.h }[4], [x21]\n" - "ld1 { v22.h }[4], [x20]\n" - "mov x19, #0x5\n" + "ld1 { v25.h }[4], [x23]\n" + "ld1 { v21.h }[4], [x22]\n" + "ld1 { v24.h }[4], [x21]\n" + "ld1 { v23.h }[4], [x20]\n" "b 7f\n" "5:" // odd_loads_2_0 "tbz %x[width], #1, 6f\n" "ldr s30, [x27], #0x4\n" "ldr s29, [x26], #0x4\n" + "mov x19, #0x2\n" "ldr s28, [x25], #0x4\n" "ldr s27, [x24], #0x4\n" - "ldr s24, [x23], #0x4\n" - "ldr s25, [x22], #0x4\n" - "ldr s23, [x21], #0x4\n" - "ldr s22, [x20], #0x4\n" - "mov x19, #0x2\n" + "ldr s25, [x23], #0x4\n" + "ldr s21, [x22], #0x4\n" + "ldr s24, [x21], #0x4\n" + "ldr s23, [x20], #0x4\n" "tbz %x[width], #0, 7f\n" "ld1 { v30.h }[2], [x27]\n" + "mov x19, #0x3\n" "ld1 { v29.h }[2], [x26]\n" "ld1 { v28.h }[2], [x25]\n" "ld1 { v27.h }[2], [x24]\n" - "ld1 { v24.h }[2], [x23]\n" - "ld1 { v25.h }[2], [x22]\n" - "ld1 { v23.h }[2], [x21]\n" - "ld1 { v22.h }[2], [x20]\n" - "mov x19, #0x3\n" + "ld1 { v25.h }[2], [x23]\n" + "ld1 { v21.h }[2], [x22]\n" + "ld1 { v24.h }[2], [x21]\n" + "ld1 { v23.h }[2], [x20]\n" "b 7f\n" "6:" // odd_loads_1_0 "ldr h30, [x27, #0x0]\n" + "mov x19, #0x1\n" "ldr h29, [x26, #0x0]\n" "ldr h28, [x25, #0x0]\n" "ldr h27, [x24, #0x0]\n" - "ldr h24, [x23, #0x0]\n" - "ldr h25, [x22, #0x0]\n" - "ldr h23, [x21, #0x0]\n" - "ldr h22, [x20, #0x0]\n" - "mov x19, #0x1\n" + "ldr h25, [x23, #0x0]\n" + "ldr h21, [x22, #0x0]\n" + "ldr h24, [x21, #0x0]\n" + "ldr h23, [x20, #0x0]\n" "7:" // Odd load end - "zip1 v26.8h, v30.8h, v24.8h\n" + "zip1 v26.8h, v30.8h, 
v25.8h\n" "subs x19, x19, #0x1\n" - "zip1 v18.8h, v28.8h, v23.8h\n" - "zip1 v20.8h, v26.8h, v18.8h\n" - "zip1 v21.8h, v29.8h, v25.8h\n" - "zip1 v19.8h, v27.8h, v22.8h\n" - "zip1 v17.8h, v21.8h, v19.8h\n" - "zip1 v16.8h, v20.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v20.8h, v28.8h, v24.8h\n" + "zip1 v18.8h, v26.8h, v20.8h\n" + "zip1 v22.8h, v29.8h, v21.8h\n" + "zip1 v19.8h, v27.8h, v23.8h\n" + "zip1 v16.8h, v22.8h, v19.8h\n" + "zip1 v17.8h, v18.8h, v16.8h\n" + "str q17, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v16.8h, v20.8h, v17.8h\n" - "subs x19, x19, #0x1\n" + "zip2 v16.8h, v18.8h, v16.8h\n" "str q16, [%x[out_ptr], #0x0]\n" + "subs x19, x19, #0x1\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v18.8h, v26.8h, v18.8h\n" - "zip2 v17.8h, v21.8h, v19.8h\n" + "zip2 v18.8h, v26.8h, v20.8h\n" + "zip2 v17.8h, v22.8h, v19.8h\n" "subs x19, x19, #0x1\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" "zip2 v16.8h, v18.8h, v17.8h\n" - "subs x19, x19, #0x1\n" "str q16, [%x[out_ptr], #0x0]\n" + "subs x19, x19, #0x1\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v24.8h, v30.8h, v24.8h\n" - "zip2 v21.8h, v28.8h, v23.8h\n" + "zip2 v25.8h, v30.8h, v25.8h\n" + "zip2 v20.8h, v28.8h, v24.8h\n" "subs x19, x19, #0x1\n" - "zip1 v18.8h, v24.8h, v21.8h\n" - "zip2 v20.8h, v29.8h, v25.8h\n" - "zip2 v19.8h, v27.8h, v22.8h\n" - "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v18.8h, v25.8h, v20.8h\n" + "zip2 v21.8h, v29.8h, v21.8h\n" + "zip2 v19.8h, v27.8h, v23.8h\n" + "zip1 v17.8h, v21.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" "zip2 v16.8h, v18.8h, v17.8h\n" - "subs x19, x19, #0x1\n" "str q16, [%x[out_ptr], #0x0]\n" + "subs x19, x19, #0x1\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v18.8h, v24.8h, v21.8h\n" - "zip2 v17.8h, v20.8h, v19.8h\n" + 
"zip2 v18.8h, v25.8h, v20.8h\n" + "zip2 v17.8h, v21.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "8:" // Odds skip - : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp index b45e622a47..b67840b280 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -79,132 +79,132 @@ void interleave_block<8, 1, VLType::None, false>( "prfm pldl1keep, [x20, #0x40]\n" "blt 3f\n" "2:" // Main loop head - "ldr d29, [x27], #0x8\n" + "ldr d30, [x27], #0x8\n" + "subs %x[width], %x[width], #0x4\n" + "ldr d29, [x26], #0x8\n" + "cmp %x[width], #0x4\n" + "ldr d28, [x25], #0x8\n" + "fcvtl v30.4s, v30.4h\n" + "ldr d21, [x24], #0x8\n" + "ldr d27, [x23], #0x8\n" + "fcvtl v29.4s, v29.4h\n" + "ldr d26, [x22], #0x8\n" + "fcvtl v28.4s, v28.4h\n" + "zip1 v20.4s, v30.4s, v28.4s\n" + "ldr d25, [x21], #0x8\n" + "fcvtl v21.4s, v21.4h\n" + "zip2 v17.4s, v30.4s, v28.4s\n" + "ldr d24, [x20], #0x8\n" + "fcvtl v27.4s, v27.4h\n" + "zip1 v18.4s, v29.4s, v21.4s\n" "prfm pldl1keep, [x27, #0x70]\n" - "ldr d28, [x26], #0x8\n" - "ldr d27, [x25], #0x8\n" + "fcvtl v26.4s, v26.4h\n" + "zip1 v23.4s, v20.4s, v18.4s\n" "prfm pldl1keep, [x26, #0x70]\n" - "ldr d26, [x24], #0x8\n" + "fcvtl v25.4s, v25.4h\n" + "zip2 v22.4s, v20.4s, v18.4s\n" "prfm pldl1keep, [x25, #0x70]\n" - "ldr d25, [x23], #0x8\n" - "ldr d24, [x22], #0x8\n" + "fcvtl v24.4s, v24.4h\n" + "zip2 v16.4s, v29.4s, v21.4s\n" "prfm pldl1keep, [x24, #0x70]\n" - "ldr d23, [x21], #0x8\n" - "ldr d22, [x20], #0x8\n" "prfm pldl1keep, [x23, #0x70]\n" + "zip1 v21.4s, v17.4s, v16.4s\n" + "zip2 v20.4s, v17.4s, v16.4s\n" "prfm pldl1keep, [x22, #0x70]\n" - "fcvtl v29.4s, v29.4h\n" - "fcvtl v28.4s, v28.4h\n" "prfm pldl1keep, [x21, #0x70]\n" - "fcvtl v27.4s, v27.4h\n" - "zip1 v20.4s, v29.4s, v27.4s\n" + "zip1 v19.4s, v27.4s, v25.4s\n" + "zip2 v18.4s, v27.4s, v25.4s\n" "prfm pldl1keep, [x20, #0x70]\n" - "fcvtl v26.4s, v26.4h\n" - "zip2 v18.4s, v29.4s, v27.4s\n" - "fcvtl v25.4s, v25.4h\n" - "fcvtl v24.4s, v24.4h\n" - "zip1 v19.4s, v28.4s, v26.4s\n" - "fcvtl v23.4s, v23.4h\n" - "zip2 v17.4s, v28.4s, v26.4s\n" - "fcvtl v22.4s, v22.4h\n" - "zip1 v16.4s, v20.4s, v19.4s\n" - "str q16, [%x[out_ptr], #0x0]\n" - "zip2 v21.4s, v20.4s, v19.4s\n" - "subs %x[width], %x[width], #0x4\n" - "zip1 v20.4s, 
v18.4s, v17.4s\n" - "cmp %x[width], #0x4\n" - "zip2 v19.4s, v18.4s, v17.4s\n" - "zip1 v18.4s, v25.4s, v23.4s\n" - "zip1 v17.4s, v24.4s, v22.4s\n" - "zip1 v16.4s, v18.4s, v17.4s\n" + "zip1 v17.4s, v26.4s, v24.4s\n" + "str q23, [%x[out_ptr], #0x0]\n" + "zip1 v16.4s, v19.4s, v17.4s\n" "str q16, [%x[out_ptr], #0x10]\n" - "zip2 v16.4s, v18.4s, v17.4s\n" - "str q21, [%x[out_ptr], #0x20]\n" - "zip2 v18.4s, v25.4s, v23.4s\n" - "str q16, [%x[out_ptr], #0x30]\n" - "zip2 v17.4s, v24.4s, v22.4s\n" - "str q20, [%x[out_ptr], #0x40]\n" - "zip1 v16.4s, v18.4s, v17.4s\n" - "str q16, [%x[out_ptr], #0x50]\n" - "zip2 v16.4s, v18.4s, v17.4s\n" - "str q19, [%x[out_ptr], #0x60]\n" + "zip2 v17.4s, v19.4s, v17.4s\n" + "str q22, [%x[out_ptr], #0x20]\n" + "zip2 v16.4s, v26.4s, v24.4s\n" + "str q17, [%x[out_ptr], #0x30]\n" + "zip1 v17.4s, v18.4s, v16.4s\n" + "str q21, [%x[out_ptr], #0x40]\n" + "zip2 v16.4s, v18.4s, v16.4s\n" + "str q17, [%x[out_ptr], #0x50]\n" + "str q20, [%x[out_ptr], #0x60]\n" "str q16, [%x[out_ptr], #0x70]\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" "bge 2b\n" "3:" // Main loop skip "cbz %x[width], 6f\n" "tbz %x[width], #1, 4f\n" - "ldr s29, [x27], #0x4\n" - "ldr s28, [x26], #0x4\n" - "ldr s27, [x25], #0x4\n" - "ldr s26, [x24], #0x4\n" - "ldr s25, [x23], #0x4\n" - "ldr s24, [x22], #0x4\n" - "ldr s23, [x21], #0x4\n" - "ldr s22, [x20], #0x4\n" + "ldr s30, [x27], #0x4\n" + "ldr s29, [x26], #0x4\n" "mov x19, #0x2\n" + "ldr s28, [x25], #0x4\n" + "ldr s21, [x24], #0x4\n" + "ldr s27, [x23], #0x4\n" + "ldr s26, [x22], #0x4\n" + "ldr s25, [x21], #0x4\n" + "ldr s24, [x20], #0x4\n" "tbz %x[width], #0, 5f\n" - "ld1 { v29.h }[2], [x27]\n" - "ld1 { v28.h }[2], [x26]\n" - "ld1 { v27.h }[2], [x25]\n" - "ld1 { v26.h }[2], [x24]\n" - "ld1 { v25.h }[2], [x23]\n" - "ld1 { v24.h }[2], [x22]\n" - "ld1 { v23.h }[2], [x21]\n" - "ld1 { v22.h }[2], [x20]\n" + "ld1 { v30.h }[2], [x27]\n" "mov x19, #0x3\n" + "ld1 { v29.h }[2], [x26]\n" + "ld1 { v28.h }[2], [x25]\n" + "ld1 { v21.h }[2], [x24]\n" + "ld1 
{ v27.h }[2], [x23]\n" + "ld1 { v26.h }[2], [x22]\n" + "ld1 { v25.h }[2], [x21]\n" + "ld1 { v24.h }[2], [x20]\n" "b 5f\n" "4:" // odd_loads_1_0 - "ldr h29, [x27, #0x0]\n" - "ldr h28, [x26, #0x0]\n" - "ldr h27, [x25, #0x0]\n" - "ldr h26, [x24, #0x0]\n" - "ldr h25, [x23, #0x0]\n" - "ldr h24, [x22, #0x0]\n" - "ldr h23, [x21, #0x0]\n" - "ldr h22, [x20, #0x0]\n" + "ldr h30, [x27, #0x0]\n" "mov x19, #0x1\n" + "ldr h29, [x26, #0x0]\n" + "ldr h28, [x25, #0x0]\n" + "ldr h21, [x24, #0x0]\n" + "ldr h27, [x23, #0x0]\n" + "ldr h26, [x22, #0x0]\n" + "ldr h25, [x21, #0x0]\n" + "ldr h24, [x20, #0x0]\n" "5:" // Odd load end + "fcvtl v30.4s, v30.4h\n" "fcvtl v29.4s, v29.4h\n" "fcvtl v28.4s, v28.4h\n" + "zip1 v20.4s, v30.4s, v28.4s\n" + "fcvtl v21.4s, v21.4h\n" "fcvtl v27.4s, v27.4h\n" - "zip1 v20.4s, v29.4s, v27.4s\n" + "zip1 v18.4s, v29.4s, v21.4s\n" "fcvtl v26.4s, v26.4h\n" "fcvtl v25.4s, v25.4h\n" - "zip1 v19.4s, v28.4s, v26.4s\n" + "zip1 v23.4s, v20.4s, v18.4s\n" + "str q23, [%x[out_ptr], #0x0]\n" + "zip1 v19.4s, v27.4s, v25.4s\n" "fcvtl v24.4s, v24.4h\n" - "fcvtl v23.4s, v23.4h\n" - "zip1 v16.4s, v20.4s, v19.4s\n" - "fcvtl v22.4s, v22.4h\n" - "zip1 v18.4s, v25.4s, v23.4s\n" - "str q16, [%x[out_ptr], #0x0]\n" "subs x19, x19, #0x1\n" - "zip1 v17.4s, v24.4s, v22.4s\n" - "zip1 v16.4s, v18.4s, v17.4s\n" + "zip1 v17.4s, v26.4s, v24.4s\n" + "zip1 v16.4s, v19.4s, v17.4s\n" "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 6f\n" - "zip2 v21.4s, v20.4s, v19.4s\n" - "zip2 v16.4s, v18.4s, v17.4s\n" - "str q21, [%x[out_ptr], #0x0]\n" - "str q16, [%x[out_ptr], #0x10]\n" + "zip2 v22.4s, v20.4s, v18.4s\n" + "str q22, [%x[out_ptr], #0x0]\n" + "zip2 v17.4s, v19.4s, v17.4s\n" "subs x19, x19, #0x1\n" + "str q17, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 6f\n" - "zip2 v18.4s, v29.4s, v27.4s\n" - "zip2 v17.4s, v28.4s, v26.4s\n" - "zip1 v20.4s, v18.4s, v17.4s\n" - "str q20, [%x[out_ptr], #0x0]\n" - "zip2 v18.4s, v25.4s, v23.4s\n" - "zip2 v17.4s, 
v24.4s, v22.4s\n" - "zip1 v16.4s, v18.4s, v17.4s\n" - "str q16, [%x[out_ptr], #0x10]\n" + "zip2 v17.4s, v30.4s, v28.4s\n" + "zip2 v16.4s, v29.4s, v21.4s\n" + "zip1 v21.4s, v17.4s, v16.4s\n" + "str q21, [%x[out_ptr], #0x0]\n" + "zip2 v18.4s, v27.4s, v25.4s\n" + "zip2 v16.4s, v26.4s, v24.4s\n" + "zip1 v17.4s, v18.4s, v16.4s\n" + "str q17, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "6:" // Odds skip - : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp index 3f38859c1c..eefb8549ea 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -80,45 +80,45 @@ void interleave_block<8, 1, VLType::None, false>( "blt 3f\n" "2:" // Main loop head "ldr q28, [x27], #0x10\n" + "subs %x[width], %x[width], #0x4\n" + "ldr q29, [x26], #0x10\n" + "cmp %x[width], #0x4\n" + "ldr q25, [x25], #0x10\n" + "zip1 v22.4s, v28.4s, v25.4s\n" + "ldr q21, [x24], #0x10\n" + "zip2 v28.4s, v28.4s, v25.4s\n" + "ldr q27, [x23], #0x10\n" + "ldr q26, [x22], #0x10\n" + "zip1 v20.4s, v29.4s, v21.4s\n" + "ldr q19, [x21], #0x10\n" + "zip2 v25.4s, v29.4s, v21.4s\n" + "ldr q24, [x20], #0x10\n" + "zip1 v23.4s, v22.4s, v20.4s\n" "prfm pldl1keep, [x27, #0x70]\n" - "ldr q27, [x26], #0x10\n" - "ldr q26, [x25], #0x10\n" - "zip1 v23.4s, v28.4s, v26.4s\n" + "zip2 v22.4s, v22.4s, v20.4s\n" "prfm pldl1keep, [x26, #0x70]\n" - "ldr q22, [x24], #0x10\n" - "zip2 v26.4s, v28.4s, v26.4s\n" + "zip1 v21.4s, v28.4s, v25.4s\n" "prfm pldl1keep, [x25, #0x70]\n" - "ldr q25, [x23], #0x10\n" - "zip1 v20.4s, v27.4s, v22.4s\n" + "zip1 v18.4s, v27.4s, v19.4s\n" "prfm pldl1keep, [x24, #0x70]\n" - "ldr q24, [x22], #0x10\n" - "zip1 v16.4s, v23.4s, v20.4s\n" + "zip1 v16.4s, v26.4s, v24.4s\n" "prfm pldl1keep, [x23, #0x70]\n" - "ldr q19, [x21], #0x10\n" - "zip2 v23.4s, v23.4s, v20.4s\n" + "zip1 v17.4s, v18.4s, v16.4s\n" "prfm pldl1keep, [x22, #0x70]\n" - "zip2 v22.4s, v27.4s, v22.4s\n" - "ldr q21, [x20], #0x10\n" - "zip1 v18.4s, v25.4s, v19.4s\n" + "zip2 v20.4s, v18.4s, v16.4s\n" "prfm pldl1keep, [x21, #0x70]\n" - "str q16, [%x[out_ptr], #0x0]\n" - "zip1 v20.4s, v26.4s, v22.4s\n" + "zip2 v19.4s, v27.4s, v19.4s\n" "prfm pldl1keep, [x20, #0x70]\n" - "zip1 v16.4s, v24.4s, v21.4s\n" - "subs %x[width], %x[width], #0x4\n" - "zip1 v17.4s, v18.4s, v16.4s\n" - "cmp %x[width], #0x4\n" - "zip2 v16.4s, v18.4s, v16.4s\n" + "zip2 v16.4s, v26.4s, v24.4s\n" + "str q23, [%x[out_ptr], #0x0]\n" + "zip1 v18.4s, v19.4s, v16.4s\n" "str q17, [%x[out_ptr], #0x10]\n" - "zip2 v19.4s, v25.4s, v19.4s\n" - "str q23, [%x[out_ptr], #0x20]\n" - "zip2 v18.4s, v24.4s, 
v21.4s\n" - "str q16, [%x[out_ptr], #0x30]\n" - "zip1 v16.4s, v19.4s, v18.4s\n" - "str q20, [%x[out_ptr], #0x40]\n" - "zip2 v17.4s, v26.4s, v22.4s\n" - "str q16, [%x[out_ptr], #0x50]\n" - "zip2 v16.4s, v19.4s, v18.4s\n" + "zip2 v17.4s, v28.4s, v25.4s\n" + "str q22, [%x[out_ptr], #0x20]\n" + "zip2 v16.4s, v19.4s, v16.4s\n" + "str q20, [%x[out_ptr], #0x30]\n" + "str q21, [%x[out_ptr], #0x40]\n" + "str q18, [%x[out_ptr], #0x50]\n" "str q17, [%x[out_ptr], #0x60]\n" "str q16, [%x[out_ptr], #0x70]\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" @@ -127,68 +127,68 @@ void interleave_block<8, 1, VLType::None, false>( "cbz %x[width], 6f\n" "tbz %x[width], #1, 4f\n" "ldr d28, [x27], #0x8\n" - "ldr d27, [x26], #0x8\n" - "ldr d26, [x25], #0x8\n" - "ldr d22, [x24], #0x8\n" - "ldr d25, [x23], #0x8\n" - "ldr d24, [x22], #0x8\n" - "ldr d19, [x21], #0x8\n" - "ldr d21, [x20], #0x8\n" + "ldr d29, [x26], #0x8\n" "mov x19, #0x2\n" + "ldr d25, [x25], #0x8\n" + "ldr d21, [x24], #0x8\n" + "ldr d27, [x23], #0x8\n" + "ldr d26, [x22], #0x8\n" + "ldr d19, [x21], #0x8\n" + "ldr d24, [x20], #0x8\n" "tbz %x[width], #0, 5f\n" "ld1 { v28.s }[2], [x27]\n" - "ld1 { v27.s }[2], [x26]\n" - "ld1 { v26.s }[2], [x25]\n" - "ld1 { v22.s }[2], [x24]\n" - "ld1 { v25.s }[2], [x23]\n" - "ld1 { v24.s }[2], [x22]\n" - "ld1 { v19.s }[2], [x21]\n" - "ld1 { v21.s }[2], [x20]\n" "mov x19, #0x3\n" + "ld1 { v29.s }[2], [x26]\n" + "ld1 { v25.s }[2], [x25]\n" + "ld1 { v21.s }[2], [x24]\n" + "ld1 { v27.s }[2], [x23]\n" + "ld1 { v26.s }[2], [x22]\n" + "ld1 { v19.s }[2], [x21]\n" + "ld1 { v24.s }[2], [x20]\n" "b 5f\n" "4:" // odd_loads_1_0 "ldr s28, [x27, #0x0]\n" - "ldr s27, [x26, #0x0]\n" - "ldr s26, [x25, #0x0]\n" - "ldr s22, [x24, #0x0]\n" - "ldr s25, [x23, #0x0]\n" - "ldr s24, [x22, #0x0]\n" - "ldr s19, [x21, #0x0]\n" - "ldr s21, [x20, #0x0]\n" "mov x19, #0x1\n" + "ldr s29, [x26, #0x0]\n" + "ldr s25, [x25, #0x0]\n" + "ldr s21, [x24, #0x0]\n" + "ldr s27, [x23, #0x0]\n" + "ldr s26, [x22, #0x0]\n" + "ldr s19, [x21, #0x0]\n" + 
"ldr s24, [x20, #0x0]\n" "5:" // Odd load end - "zip1 v23.4s, v28.4s, v26.4s\n" + "zip1 v22.4s, v28.4s, v25.4s\n" "subs x19, x19, #0x1\n" - "zip1 v20.4s, v27.4s, v22.4s\n" - "zip1 v16.4s, v23.4s, v20.4s\n" - "str q16, [%x[out_ptr], #0x0]\n" - "zip1 v18.4s, v25.4s, v19.4s\n" - "zip1 v16.4s, v24.4s, v21.4s\n" + "zip1 v20.4s, v29.4s, v21.4s\n" + "zip1 v23.4s, v22.4s, v20.4s\n" + "str q23, [%x[out_ptr], #0x0]\n" + "zip1 v18.4s, v27.4s, v19.4s\n" + "zip1 v16.4s, v26.4s, v24.4s\n" "zip1 v17.4s, v18.4s, v16.4s\n" "str q17, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 6f\n" - "zip2 v23.4s, v23.4s, v20.4s\n" - "zip2 v16.4s, v18.4s, v16.4s\n" - "str q23, [%x[out_ptr], #0x0]\n" - "str q16, [%x[out_ptr], #0x10]\n" + "zip2 v22.4s, v22.4s, v20.4s\n" + "str q22, [%x[out_ptr], #0x0]\n" + "zip2 v20.4s, v18.4s, v16.4s\n" "subs x19, x19, #0x1\n" + "str q20, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 6f\n" - "zip2 v26.4s, v28.4s, v26.4s\n" - "zip2 v22.4s, v27.4s, v22.4s\n" - "zip1 v20.4s, v26.4s, v22.4s\n" - "str q20, [%x[out_ptr], #0x0]\n" - "zip2 v19.4s, v25.4s, v19.4s\n" - "zip2 v18.4s, v24.4s, v21.4s\n" - "zip1 v16.4s, v19.4s, v18.4s\n" - "str q16, [%x[out_ptr], #0x10]\n" + "zip2 v28.4s, v28.4s, v25.4s\n" + "zip2 v25.4s, v29.4s, v21.4s\n" + "zip1 v21.4s, v28.4s, v25.4s\n" + "str q21, [%x[out_ptr], #0x0]\n" + "zip2 v19.4s, v27.4s, v19.4s\n" + "zip2 v16.4s, v26.4s, v24.4s\n" + "zip1 v18.4s, v19.4s, v16.4s\n" + "str q18, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "6:" // Odds skip - : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", 
"v26", "v27", "v28", "v29", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp index 03f552a575..b0523b96ce 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -80,51 +80,51 @@ void interleave_block<8, 1, VLType::None, false>( "blt 3f\n" "2:" // Main loop head "ldr q30, [x27], #0x10\n" - "prfm pldl1keep, [x27, #0x70]\n" + "subs %x[width], %x[width], #0x8\n" "ldr q29, [x26], #0x10\n" + "cmp %x[width], #0x8\n" "ldr q28, [x25], #0x10\n" - "prfm pldl1keep, [x26, #0x70]\n" "ldr q27, [x24], #0x10\n" + "ldr q25, [x23], #0x10\n" + "zip1 v26.8h, v30.8h, v25.8h\n" + "ldr q21, [x22], #0x10\n" + "zip2 v25.8h, v30.8h, v25.8h\n" + "ldr q24, [x21], #0x10\n" + "ldr q23, [x20], #0x10\n" + "zip1 v22.8h, v29.8h, v21.8h\n" + "prfm pldl1keep, [x27, #0x70]\n" + "zip2 v21.8h, v29.8h, v21.8h\n" + "prfm pldl1keep, [x26, #0x70]\n" + "zip1 v20.8h, v28.8h, v24.8h\n" "prfm pldl1keep, [x25, #0x70]\n" - "ldr q24, [x23], #0x10\n" - "zip1 v26.8h, v30.8h, v24.8h\n" + "zip1 v18.8h, v26.8h, v20.8h\n" "prfm pldl1keep, [x24, #0x70]\n" - "ldr q25, [x22], #0x10\n" - "zip2 v24.8h, v30.8h, v24.8h\n" + "zip1 v19.8h, v27.8h, v23.8h\n" "prfm pldl1keep, [x23, #0x70]\n" - "ldr q23, [x21], #0x10\n" - "zip1 v21.8h, v29.8h, v25.8h\n" + "zip1 v16.8h, v22.8h, v19.8h\n" "prfm pldl1keep, [x22, #0x70]\n" - "ldr q22, [x20], #0x10\n" - "zip1 v18.8h, v28.8h, v23.8h\n" + "zip1 v17.8h, v18.8h, v16.8h\n" "prfm pldl1keep, [x21, #0x70]\n" - "subs %x[width], %x[width], #0x8\n" - "zip1 v20.8h, v26.8h, v18.8h\n" + "zip2 v16.8h, v18.8h, v16.8h\n" "prfm 
pldl1keep, [x20, #0x70]\n" - "zip1 v19.8h, v27.8h, v22.8h\n" - "cmp %x[width], #0x8\n" - "zip1 v17.8h, v21.8h, v19.8h\n" - "zip2 v18.8h, v26.8h, v18.8h\n" - "zip1 v16.8h, v20.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" - "zip2 v16.8h, v20.8h, v17.8h\n" + "zip2 v18.8h, v26.8h, v20.8h\n" + "str q17, [%x[out_ptr], #0x0]\n" + "zip2 v17.8h, v22.8h, v19.8h\n" "str q16, [%x[out_ptr], #0x10]\n" - "zip2 v17.8h, v21.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x20]\n" "zip2 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x30]\n" - "zip2 v21.8h, v28.8h, v23.8h\n" - "zip1 v18.8h, v24.8h, v21.8h\n" - "zip2 v20.8h, v29.8h, v25.8h\n" - "zip2 v19.8h, v27.8h, v22.8h\n" - "zip1 v17.8h, v20.8h, v19.8h\n" + "zip2 v20.8h, v28.8h, v24.8h\n" + "zip1 v18.8h, v25.8h, v20.8h\n" + "zip2 v19.8h, v27.8h, v23.8h\n" + "zip1 v17.8h, v21.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x40]\n" "zip2 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x50]\n" - "zip2 v18.8h, v24.8h, v21.8h\n" - "zip2 v17.8h, v20.8h, v19.8h\n" + "zip2 v18.8h, v25.8h, v20.8h\n" + "zip2 v17.8h, v21.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x60]\n" "zip2 v16.8h, v18.8h, v17.8h\n" @@ -138,129 +138,129 @@ void interleave_block<8, 1, VLType::None, false>( "ldr d29, [x26], #0x8\n" "ldr d28, [x25], #0x8\n" "ldr d27, [x24], #0x8\n" - "ldr d24, [x23], #0x8\n" - "ldr d25, [x22], #0x8\n" - "ldr d23, [x21], #0x8\n" - "ldr d22, [x20], #0x8\n" + "ldr d25, [x23], #0x8\n" + "ldr d21, [x22], #0x8\n" + "ldr d24, [x21], #0x8\n" + "ldr d23, [x20], #0x8\n" "tbz %x[width], #1, 4f\n" "ld1 { v30.s }[2], [x27], #0x4\n" + "mov x19, #0x6\n" "ld1 { v29.s }[2], [x26], #0x4\n" "ld1 { v28.s }[2], [x25], #0x4\n" "ld1 { v27.s }[2], [x24], #0x4\n" - "ld1 { v24.s }[2], [x23], #0x4\n" - "ld1 { v25.s }[2], [x22], #0x4\n" - "ld1 { v23.s }[2], [x21], #0x4\n" - "ld1 { v22.s }[2], [x20], #0x4\n" - "mov x19, #0x6\n" + "ld1 { v25.s }[2], [x23], #0x4\n" + "ld1 { v21.s 
}[2], [x22], #0x4\n" + "ld1 { v24.s }[2], [x21], #0x4\n" + "ld1 { v23.s }[2], [x20], #0x4\n" "tbz %x[width], #0, 7f\n" "ld1 { v30.h }[6], [x27]\n" + "mov x19, #0x7\n" "ld1 { v29.h }[6], [x26]\n" "ld1 { v28.h }[6], [x25]\n" "ld1 { v27.h }[6], [x24]\n" - "ld1 { v24.h }[6], [x23]\n" - "ld1 { v25.h }[6], [x22]\n" - "ld1 { v23.h }[6], [x21]\n" - "ld1 { v22.h }[6], [x20]\n" - "mov x19, #0x7\n" + "ld1 { v25.h }[6], [x23]\n" + "ld1 { v21.h }[6], [x22]\n" + "ld1 { v24.h }[6], [x21]\n" + "ld1 { v23.h }[6], [x20]\n" "b 7f\n" "4:" // odd_loads_1_4 "mov x19, #0x4\n" "tbz %x[width], #0, 7f\n" "ld1 { v30.h }[4], [x27]\n" "ld1 { v29.h }[4], [x26]\n" + "mov x19, #0x5\n" "ld1 { v28.h }[4], [x25]\n" "ld1 { v27.h }[4], [x24]\n" - "ld1 { v24.h }[4], [x23]\n" - "ld1 { v25.h }[4], [x22]\n" - "ld1 { v23.h }[4], [x21]\n" - "ld1 { v22.h }[4], [x20]\n" - "mov x19, #0x5\n" + "ld1 { v25.h }[4], [x23]\n" + "ld1 { v21.h }[4], [x22]\n" + "ld1 { v24.h }[4], [x21]\n" + "ld1 { v23.h }[4], [x20]\n" "b 7f\n" "5:" // odd_loads_2_0 "tbz %x[width], #1, 6f\n" "ldr s30, [x27], #0x4\n" "ldr s29, [x26], #0x4\n" + "mov x19, #0x2\n" "ldr s28, [x25], #0x4\n" "ldr s27, [x24], #0x4\n" - "ldr s24, [x23], #0x4\n" - "ldr s25, [x22], #0x4\n" - "ldr s23, [x21], #0x4\n" - "ldr s22, [x20], #0x4\n" - "mov x19, #0x2\n" + "ldr s25, [x23], #0x4\n" + "ldr s21, [x22], #0x4\n" + "ldr s24, [x21], #0x4\n" + "ldr s23, [x20], #0x4\n" "tbz %x[width], #0, 7f\n" "ld1 { v30.h }[2], [x27]\n" + "mov x19, #0x3\n" "ld1 { v29.h }[2], [x26]\n" "ld1 { v28.h }[2], [x25]\n" "ld1 { v27.h }[2], [x24]\n" - "ld1 { v24.h }[2], [x23]\n" - "ld1 { v25.h }[2], [x22]\n" - "ld1 { v23.h }[2], [x21]\n" - "ld1 { v22.h }[2], [x20]\n" - "mov x19, #0x3\n" + "ld1 { v25.h }[2], [x23]\n" + "ld1 { v21.h }[2], [x22]\n" + "ld1 { v24.h }[2], [x21]\n" + "ld1 { v23.h }[2], [x20]\n" "b 7f\n" "6:" // odd_loads_1_0 "ldr h30, [x27, #0x0]\n" + "mov x19, #0x1\n" "ldr h29, [x26, #0x0]\n" "ldr h28, [x25, #0x0]\n" "ldr h27, [x24, #0x0]\n" - "ldr h24, [x23, #0x0]\n" - "ldr h25, 
[x22, #0x0]\n" - "ldr h23, [x21, #0x0]\n" - "ldr h22, [x20, #0x0]\n" - "mov x19, #0x1\n" + "ldr h25, [x23, #0x0]\n" + "ldr h21, [x22, #0x0]\n" + "ldr h24, [x21, #0x0]\n" + "ldr h23, [x20, #0x0]\n" "7:" // Odd load end - "zip1 v26.8h, v30.8h, v24.8h\n" + "zip1 v26.8h, v30.8h, v25.8h\n" "subs x19, x19, #0x1\n" - "zip1 v18.8h, v28.8h, v23.8h\n" - "zip1 v20.8h, v26.8h, v18.8h\n" - "zip1 v21.8h, v29.8h, v25.8h\n" - "zip1 v19.8h, v27.8h, v22.8h\n" - "zip1 v17.8h, v21.8h, v19.8h\n" - "zip1 v16.8h, v20.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v20.8h, v28.8h, v24.8h\n" + "zip1 v18.8h, v26.8h, v20.8h\n" + "zip1 v22.8h, v29.8h, v21.8h\n" + "zip1 v19.8h, v27.8h, v23.8h\n" + "zip1 v16.8h, v22.8h, v19.8h\n" + "zip1 v17.8h, v18.8h, v16.8h\n" + "str q17, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v16.8h, v20.8h, v17.8h\n" - "subs x19, x19, #0x1\n" + "zip2 v16.8h, v18.8h, v16.8h\n" "str q16, [%x[out_ptr], #0x0]\n" + "subs x19, x19, #0x1\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v18.8h, v26.8h, v18.8h\n" - "zip2 v17.8h, v21.8h, v19.8h\n" + "zip2 v18.8h, v26.8h, v20.8h\n" + "zip2 v17.8h, v22.8h, v19.8h\n" "subs x19, x19, #0x1\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" "zip2 v16.8h, v18.8h, v17.8h\n" - "subs x19, x19, #0x1\n" "str q16, [%x[out_ptr], #0x0]\n" + "subs x19, x19, #0x1\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v24.8h, v30.8h, v24.8h\n" - "zip2 v21.8h, v28.8h, v23.8h\n" + "zip2 v25.8h, v30.8h, v25.8h\n" + "zip2 v20.8h, v28.8h, v24.8h\n" "subs x19, x19, #0x1\n" - "zip1 v18.8h, v24.8h, v21.8h\n" - "zip2 v20.8h, v29.8h, v25.8h\n" - "zip2 v19.8h, v27.8h, v22.8h\n" - "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v18.8h, v25.8h, v20.8h\n" + "zip2 v21.8h, v29.8h, v21.8h\n" + "zip2 v19.8h, v27.8h, v23.8h\n" + "zip1 v17.8h, v21.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add 
%x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" "zip2 v16.8h, v18.8h, v17.8h\n" - "subs x19, x19, #0x1\n" "str q16, [%x[out_ptr], #0x0]\n" + "subs x19, x19, #0x1\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v18.8h, v24.8h, v21.8h\n" - "zip2 v17.8h, v20.8h, v19.8h\n" + "zip2 v18.8h, v25.8h, v20.8h\n" + "zip2 v17.8h, v21.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "8:" // Odds skip - : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp index 35c7719de7..292a38f401 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -96,58 +96,58 @@ void interleave_block<8, 1, VLType::None, true>( "movi v1.8h, #0x0\n" "4:" // no_accumulate_16 "ldr q30, [x27], #0x10\n" - "prfm pldl1keep, [x27, #0x70]\n" + "add x19, x19, #0x1\n" "ldr q29, [x26], #0x10\n" + "subs %x[width], %x[width], #0x8\n" "ldr q28, [x25], #0x10\n" - "prfm pldl1keep, [x26, #0x70]\n" + "cmp %x[width], #0x8\n" "ldr q27, [x24], #0x10\n" + "ldr q25, [x23], #0x10\n" + "zip1 v26.8h, v30.8h, v25.8h\n" + "ldr q21, [x22], #0x10\n" + "zip2 v25.8h, v30.8h, v25.8h\n" + "ldr q24, [x21], #0x10\n" + "ldr q23, [x20], #0x10\n" + "zip1 v22.8h, v29.8h, v21.8h\n" + "prfm pldl1keep, [x27, #0x70]\n" + "zip2 v21.8h, v29.8h, v21.8h\n" + "prfm pldl1keep, [x26, #0x70]\n" + "zip1 v20.8h, v28.8h, v24.8h\n" "prfm pldl1keep, [x25, #0x70]\n" - "ldr q24, [x23], #0x10\n" - "zip1 v26.8h, v30.8h, v24.8h\n" + "zip1 v18.8h, v26.8h, v20.8h\n" "prfm pldl1keep, [x24, #0x70]\n" - "ldr q25, [x22], #0x10\n" - "zip2 v24.8h, v30.8h, v24.8h\n" + "zip1 v19.8h, v27.8h, v23.8h\n" "prfm pldl1keep, [x23, #0x70]\n" - "ldr q23, [x21], #0x10\n" - "zip1 v21.8h, v29.8h, v25.8h\n" + "zip1 v16.8h, v22.8h, v19.8h\n" "prfm pldl1keep, [x22, #0x70]\n" - "ldr q22, [x20], #0x10\n" - "zip1 v18.8h, v28.8h, v23.8h\n" + "zip1 v17.8h, v18.8h, v16.8h\n" "prfm pldl1keep, [x21, #0x70]\n" - "add x19, x19, #0x1\n" - "zip1 v20.8h, v26.8h, v18.8h\n" - "prfm pldl1keep, [x20, #0x70]\n" - "zip1 v19.8h, v27.8h, v22.8h\n" - "subs %x[width], %x[width], #0x8\n" - "zip1 v17.8h, v21.8h, v19.8h\n" - "cmp %x[width], #0x8\n" - "zip2 v18.8h, v26.8h, v18.8h\n" - "zip1 v16.8h, v20.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" - "add v1.8h, v1.8h, v16.8h\n" - "zip2 v17.8h, v20.8h, v17.8h\n" - "str q17, [%x[out_ptr], #0x10]\n" - "zip2 v16.8h, v21.8h, v19.8h\n" "add v1.8h, v1.8h, v17.8h\n" - "zip1 v17.8h, v18.8h, v16.8h\n" - "str q17, [%x[out_ptr], #0x20]\n" + "prfm pldl1keep, [x20, #0x70]\n" "zip2 v16.8h, v18.8h, v16.8h\n" + "str q17, [%x[out_ptr], #0x0]\n" + "zip2 v18.8h, 
v26.8h, v20.8h\n" + "str q16, [%x[out_ptr], #0x10]\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v17.8h, v22.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x20]\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x30]\n" - "add v1.8h, v1.8h, v17.8h\n" - "zip2 v21.8h, v28.8h, v23.8h\n" - "zip1 v18.8h, v24.8h, v21.8h\n" + "zip2 v20.8h, v28.8h, v24.8h\n" "add v1.8h, v1.8h, v16.8h\n" - "zip2 v20.8h, v29.8h, v25.8h\n" - "zip2 v19.8h, v27.8h, v22.8h\n" - "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v18.8h, v25.8h, v20.8h\n" + "zip2 v19.8h, v27.8h, v23.8h\n" + "zip1 v17.8h, v21.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x40]\n" "add v1.8h, v1.8h, v16.8h\n" "zip2 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x50]\n" - "zip2 v18.8h, v24.8h, v21.8h\n" + "zip2 v18.8h, v25.8h, v20.8h\n" "add v1.8h, v1.8h, v16.8h\n" - "zip2 v17.8h, v20.8h, v19.8h\n" + "zip2 v17.8h, v21.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x60]\n" "add v1.8h, v1.8h, v16.8h\n" @@ -163,140 +163,140 @@ void interleave_block<8, 1, VLType::None, true>( "ldr d29, [x26], #0x8\n" "ldr d28, [x25], #0x8\n" "ldr d27, [x24], #0x8\n" - "ldr d24, [x23], #0x8\n" - "ldr d25, [x22], #0x8\n" - "ldr d23, [x21], #0x8\n" - "ldr d22, [x20], #0x8\n" + "ldr d25, [x23], #0x8\n" + "ldr d21, [x22], #0x8\n" + "ldr d24, [x21], #0x8\n" + "ldr d23, [x20], #0x8\n" "tbz %x[width], #1, 6f\n" "ld1 { v30.s }[2], [x27], #0x4\n" + "mov x19, #0x6\n" "ld1 { v29.s }[2], [x26], #0x4\n" "ld1 { v28.s }[2], [x25], #0x4\n" "ld1 { v27.s }[2], [x24], #0x4\n" - "ld1 { v24.s }[2], [x23], #0x4\n" - "ld1 { v25.s }[2], [x22], #0x4\n" - "ld1 { v23.s }[2], [x21], #0x4\n" - "ld1 { v22.s }[2], [x20], #0x4\n" - "mov x19, #0x6\n" + "ld1 { v25.s }[2], [x23], #0x4\n" + "ld1 { v21.s }[2], [x22], #0x4\n" + "ld1 { v24.s }[2], [x21], #0x4\n" + "ld1 { v23.s }[2], [x20], #0x4\n" "tbz %x[width], #0, 9f\n" "ld1 { v30.h }[6], [x27]\n" + "mov x19, 
#0x7\n" "ld1 { v29.h }[6], [x26]\n" "ld1 { v28.h }[6], [x25]\n" "ld1 { v27.h }[6], [x24]\n" - "ld1 { v24.h }[6], [x23]\n" - "ld1 { v25.h }[6], [x22]\n" - "ld1 { v23.h }[6], [x21]\n" - "ld1 { v22.h }[6], [x20]\n" - "mov x19, #0x7\n" + "ld1 { v25.h }[6], [x23]\n" + "ld1 { v21.h }[6], [x22]\n" + "ld1 { v24.h }[6], [x21]\n" + "ld1 { v23.h }[6], [x20]\n" "b 9f\n" "6:" // odd_loads_1_4 "mov x19, #0x4\n" "tbz %x[width], #0, 9f\n" "ld1 { v30.h }[4], [x27]\n" "ld1 { v29.h }[4], [x26]\n" + "mov x19, #0x5\n" "ld1 { v28.h }[4], [x25]\n" "ld1 { v27.h }[4], [x24]\n" - "ld1 { v24.h }[4], [x23]\n" - "ld1 { v25.h }[4], [x22]\n" - "ld1 { v23.h }[4], [x21]\n" - "ld1 { v22.h }[4], [x20]\n" - "mov x19, #0x5\n" + "ld1 { v25.h }[4], [x23]\n" + "ld1 { v21.h }[4], [x22]\n" + "ld1 { v24.h }[4], [x21]\n" + "ld1 { v23.h }[4], [x20]\n" "b 9f\n" "7:" // odd_loads_2_0 "tbz %x[width], #1, 8f\n" "ldr s30, [x27], #0x4\n" "ldr s29, [x26], #0x4\n" + "mov x19, #0x2\n" "ldr s28, [x25], #0x4\n" "ldr s27, [x24], #0x4\n" - "ldr s24, [x23], #0x4\n" - "ldr s25, [x22], #0x4\n" - "ldr s23, [x21], #0x4\n" - "ldr s22, [x20], #0x4\n" - "mov x19, #0x2\n" + "ldr s25, [x23], #0x4\n" + "ldr s21, [x22], #0x4\n" + "ldr s24, [x21], #0x4\n" + "ldr s23, [x20], #0x4\n" "tbz %x[width], #0, 9f\n" "ld1 { v30.h }[2], [x27]\n" + "mov x19, #0x3\n" "ld1 { v29.h }[2], [x26]\n" "ld1 { v28.h }[2], [x25]\n" "ld1 { v27.h }[2], [x24]\n" - "ld1 { v24.h }[2], [x23]\n" - "ld1 { v25.h }[2], [x22]\n" - "ld1 { v23.h }[2], [x21]\n" - "ld1 { v22.h }[2], [x20]\n" - "mov x19, #0x3\n" + "ld1 { v25.h }[2], [x23]\n" + "ld1 { v21.h }[2], [x22]\n" + "ld1 { v24.h }[2], [x21]\n" + "ld1 { v23.h }[2], [x20]\n" "b 9f\n" "8:" // odd_loads_1_0 "ldr h30, [x27, #0x0]\n" + "mov x19, #0x1\n" "ldr h29, [x26, #0x0]\n" "ldr h28, [x25, #0x0]\n" "ldr h27, [x24, #0x0]\n" - "ldr h24, [x23, #0x0]\n" - "ldr h25, [x22, #0x0]\n" - "ldr h23, [x21, #0x0]\n" - "ldr h22, [x20, #0x0]\n" - "mov x19, #0x1\n" + "ldr h25, [x23, #0x0]\n" + "ldr h21, [x22, #0x0]\n" + "ldr h24, 
[x21, #0x0]\n" + "ldr h23, [x20, #0x0]\n" "9:" // Odd load end - "zip1 v26.8h, v30.8h, v24.8h\n" + "zip1 v26.8h, v30.8h, v25.8h\n" "subs x19, x19, #0x1\n" - "zip1 v18.8h, v28.8h, v23.8h\n" - "zip1 v20.8h, v26.8h, v18.8h\n" - "zip1 v21.8h, v29.8h, v25.8h\n" - "zip1 v19.8h, v27.8h, v22.8h\n" - "zip1 v17.8h, v21.8h, v19.8h\n" - "zip1 v16.8h, v20.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" - "add v1.8h, v1.8h, v16.8h\n" + "zip1 v20.8h, v28.8h, v24.8h\n" + "zip1 v18.8h, v26.8h, v20.8h\n" + "zip1 v22.8h, v29.8h, v21.8h\n" + "zip1 v19.8h, v27.8h, v23.8h\n" + "zip1 v16.8h, v22.8h, v19.8h\n" + "zip1 v17.8h, v18.8h, v16.8h\n" + "str q17, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" + "add v1.8h, v1.8h, v17.8h\n" "beq 10f\n" - "zip2 v17.8h, v20.8h, v17.8h\n" + "zip2 v16.8h, v18.8h, v16.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" "subs x19, x19, #0x1\n" - "add v1.8h, v1.8h, v17.8h\n" - "str q17, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v18.8h, v26.8h, v18.8h\n" - "zip2 v16.8h, v21.8h, v19.8h\n" + "zip2 v18.8h, v26.8h, v20.8h\n" + "zip2 v17.8h, v22.8h, v19.8h\n" "subs x19, x19, #0x1\n" - "zip1 v17.8h, v18.8h, v16.8h\n" - "str q17, [%x[out_ptr], #0x0]\n" - "add v1.8h, v1.8h, v17.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" + "add v1.8h, v1.8h, v16.8h\n" "beq 10f\n" - "zip2 v16.8h, v18.8h, v16.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" "subs x19, x19, #0x1\n" "add v1.8h, v1.8h, v16.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v24.8h, v30.8h, v24.8h\n" - "zip2 v21.8h, v28.8h, v23.8h\n" + "zip2 v25.8h, v30.8h, v25.8h\n" + "zip2 v20.8h, v28.8h, v24.8h\n" "subs x19, x19, #0x1\n" - "zip1 v18.8h, v24.8h, v21.8h\n" - "zip2 v20.8h, v29.8h, v25.8h\n" - "zip2 v19.8h, v27.8h, v22.8h\n" - "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v18.8h, v25.8h, 
v20.8h\n" + "zip2 v21.8h, v29.8h, v21.8h\n" + "zip2 v19.8h, v27.8h, v23.8h\n" + "zip1 v17.8h, v21.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" - "add v1.8h, v1.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" + "add v1.8h, v1.8h, v16.8h\n" "beq 10f\n" "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" "subs x19, x19, #0x1\n" "add v1.8h, v1.8h, v16.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v18.8h, v24.8h, v21.8h\n" - "zip2 v17.8h, v20.8h, v19.8h\n" + "zip2 v18.8h, v25.8h, v20.8h\n" + "zip2 v17.8h, v21.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" - "add v1.8h, v1.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" + "add v1.8h, v1.8h, v16.8h\n" "10:" // Odds skip "saddw v0.4s, v0.4s, v1.4h\n" "str q0, [%x[out_ptr], #0x0]\n" "saddw2 v31.4s, v31.4s, v1.8h\n" "str q31, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" - : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp index 582836fe67..6cfed8f3a4 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -79,206 +79,206 @@ void interleave_block<8, 1, VLType::None, false>( "prfm pldl1keep, [x20, #0x40]\n" "blt 3f\n" "2:" // Main loop head - "ldr d30, [x27], #0x8\n" + "ldr d31, [x27], #0x8\n" + "sshll v31.8h, v31.8b, #0x0\n" + "ldr d30, [x26], #0x8\n" + "subs %x[width], %x[width], #0x8\n" + "sshll v30.8h, v30.8b, #0x0\n" + "ldr d29, [x25], #0x8\n" + "cmp %x[width], #0x8\n" + "sshll v29.8h, v29.8b, #0x0\n" + "ldr d28, [x24], #0x8\n" + "ldr d25, [x23], #0x8\n" + "sshll v28.8h, v28.8b, #0x0\n" + "ldr d23, [x22], #0x8\n" + "sshll v25.8h, v25.8b, #0x0\n" + "ldr d27, [x21], #0x8\n" + "zip1 v20.8h, v31.8h, v25.8h\n" + "ldr d26, [x20], #0x8\n" + "zip2 v25.8h, v31.8h, v25.8h\n" "prfm pldl1keep, [x27, #0x70]\n" - "ldr d29, [x26], #0x8\n" - "ldr d28, [x25], #0x8\n" "prfm pldl1keep, [x26, #0x70]\n" - "ldr d27, [x24], #0x8\n" + "sshll v23.8h, v23.8b, #0x0\n" "prfm pldl1keep, [x25, #0x70]\n" - "ldr d23, [x23], #0x8\n" - "ldr d21, [x22], #0x8\n" + "zip1 v24.8h, v30.8h, v23.8h\n" + "zip2 v23.8h, v30.8h, v23.8h\n" "prfm pldl1keep, [x24, #0x70]\n" - "ldr d26, [x21], #0x8\n" - "ldr d25, [x20], #0x8\n" + "sshll v27.8h, v27.8b, #0x0\n" "prfm pldl1keep, [x23, #0x70]\n" + "zip1 v19.8h, v29.8h, v27.8h\n" "prfm pldl1keep, [x22, #0x70]\n" - "sshll v30.8h, v30.8b, #0x0\n" - "sshll v29.8h, v29.8b, #0x0\n" + "zip1 v22.8h, v20.8h, v19.8h\n" "prfm pldl1keep, [x21, #0x70]\n" - "sshll v28.8h, v28.8b, #0x0\n" + "zip2 v21.8h, v20.8h, v19.8h\n" "prfm pldl1keep, [x20, #0x70]\n" - "sshll v27.8h, v27.8b, #0x0\n" - "sshll v23.8h, v23.8b, #0x0\n" - "zip1 v24.8h, v30.8h, v23.8h\n" - "sshll v21.8h, v21.8b, #0x0\n" - "zip2 v23.8h, v30.8h, v23.8h\n" + "zip2 v19.8h, v29.8h, v27.8h\n" + "zip1 v20.8h, v25.8h, v19.8h\n" + "zip2 v19.8h, v25.8h, v19.8h\n" "sshll v26.8h, v26.8b, #0x0\n" - "sshll v25.8h, v25.8b, #0x0\n" - "zip1 v22.8h, v29.8h, v21.8h\n" - "subs %x[width], %x[width], #0x8\n" - "zip2 v21.8h, v29.8h, v21.8h\n" - "cmp %x[width], #0x8\n" - "zip1 v20.8h, v28.8h, 
v26.8h\n" - "zip1 v18.8h, v24.8h, v20.8h\n" - "zip1 v19.8h, v27.8h, v25.8h\n" - "zip1 v17.8h, v22.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" + "zip1 v18.8h, v28.8h, v26.8h\n" + "zip1 v17.8h, v24.8h, v18.8h\n" + "zip1 v16.8h, v22.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" - "zip2 v16.8h, v18.8h, v17.8h\n" + "zip2 v16.8h, v22.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x10]\n" - "zip2 v18.8h, v24.8h, v20.8h\n" - "zip2 v17.8h, v22.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" + "zip2 v17.8h, v24.8h, v18.8h\n" + "zip1 v16.8h, v21.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x20]\n" - "zip2 v16.8h, v18.8h, v17.8h\n" + "zip2 v16.8h, v21.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x30]\n" - "zip2 v20.8h, v28.8h, v26.8h\n" - "zip1 v18.8h, v23.8h, v20.8h\n" - "zip2 v19.8h, v27.8h, v25.8h\n" - "zip1 v17.8h, v21.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" + "zip2 v18.8h, v28.8h, v26.8h\n" + "zip1 v17.8h, v23.8h, v18.8h\n" + "zip1 v16.8h, v20.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x40]\n" - "zip2 v16.8h, v18.8h, v17.8h\n" + "zip2 v16.8h, v20.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x50]\n" - "zip2 v18.8h, v23.8h, v20.8h\n" - "zip2 v17.8h, v21.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" + "zip2 v17.8h, v23.8h, v18.8h\n" + "zip1 v16.8h, v19.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x60]\n" - "zip2 v16.8h, v18.8h, v17.8h\n" + "zip2 v16.8h, v19.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x70]\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" "bge 2b\n" "3:" // Main loop skip "cbz %x[width], 8f\n" "tbz %x[width], #2, 5f\n" - "ldr s30, [x27], #0x4\n" - "ldr s29, [x26], #0x4\n" - "ldr s28, [x25], #0x4\n" - "ldr s27, [x24], #0x4\n" - "ldr s23, [x23], #0x4\n" - "ldr s21, [x22], #0x4\n" - "ldr s26, [x21], #0x4\n" - "ldr s25, [x20], #0x4\n" + "ldr s31, [x27], #0x4\n" + "ldr s30, [x26], #0x4\n" + "ldr s29, [x25], #0x4\n" + "ldr s28, [x24], #0x4\n" + "ldr s25, [x23], #0x4\n" + "ldr s23, [x22], #0x4\n" + "ldr s27, [x21], #0x4\n" + "ldr s26, [x20], #0x4\n" "tbz %x[width], #1, 4f\n" - "ld1 { v30.h 
}[2], [x27], #0x2\n" - "ld1 { v29.h }[2], [x26], #0x2\n" - "ld1 { v28.h }[2], [x25], #0x2\n" - "ld1 { v27.h }[2], [x24], #0x2\n" - "ld1 { v23.h }[2], [x23], #0x2\n" - "ld1 { v21.h }[2], [x22], #0x2\n" - "ld1 { v26.h }[2], [x21], #0x2\n" - "ld1 { v25.h }[2], [x20], #0x2\n" + "ld1 { v31.h }[2], [x27], #0x2\n" "mov x19, #0x6\n" + "ld1 { v30.h }[2], [x26], #0x2\n" + "ld1 { v29.h }[2], [x25], #0x2\n" + "ld1 { v28.h }[2], [x24], #0x2\n" + "ld1 { v25.h }[2], [x23], #0x2\n" + "ld1 { v23.h }[2], [x22], #0x2\n" + "ld1 { v27.h }[2], [x21], #0x2\n" + "ld1 { v26.h }[2], [x20], #0x2\n" "tbz %x[width], #0, 7f\n" - "ld1 { v30.b }[6], [x27]\n" - "ld1 { v29.b }[6], [x26]\n" - "ld1 { v28.b }[6], [x25]\n" - "ld1 { v27.b }[6], [x24]\n" - "ld1 { v23.b }[6], [x23]\n" - "ld1 { v21.b }[6], [x22]\n" - "ld1 { v26.b }[6], [x21]\n" - "ld1 { v25.b }[6], [x20]\n" + "ld1 { v31.b }[6], [x27]\n" "mov x19, #0x7\n" + "ld1 { v30.b }[6], [x26]\n" + "ld1 { v29.b }[6], [x25]\n" + "ld1 { v28.b }[6], [x24]\n" + "ld1 { v25.b }[6], [x23]\n" + "ld1 { v23.b }[6], [x22]\n" + "ld1 { v27.b }[6], [x21]\n" + "ld1 { v26.b }[6], [x20]\n" "b 7f\n" "4:" // odd_loads_1_4 "mov x19, #0x4\n" "tbz %x[width], #0, 7f\n" - "ld1 { v30.b }[4], [x27]\n" - "ld1 { v29.b }[4], [x26]\n" - "ld1 { v28.b }[4], [x25]\n" - "ld1 { v27.b }[4], [x24]\n" - "ld1 { v23.b }[4], [x23]\n" - "ld1 { v21.b }[4], [x22]\n" - "ld1 { v26.b }[4], [x21]\n" - "ld1 { v25.b }[4], [x20]\n" + "ld1 { v31.b }[4], [x27]\n" + "ld1 { v30.b }[4], [x26]\n" "mov x19, #0x5\n" + "ld1 { v29.b }[4], [x25]\n" + "ld1 { v28.b }[4], [x24]\n" + "ld1 { v25.b }[4], [x23]\n" + "ld1 { v23.b }[4], [x22]\n" + "ld1 { v27.b }[4], [x21]\n" + "ld1 { v26.b }[4], [x20]\n" "b 7f\n" "5:" // odd_loads_2_0 "tbz %x[width], #1, 6f\n" - "ldr h30, [x27], #0x2\n" - "ldr h29, [x26], #0x2\n" - "ldr h28, [x25], #0x2\n" - "ldr h27, [x24], #0x2\n" - "ldr h23, [x23], #0x2\n" - "ldr h21, [x22], #0x2\n" - "ldr h26, [x21], #0x2\n" - "ldr h25, [x20], #0x2\n" + "ldr h31, [x27], #0x2\n" + "ldr h30, [x26], 
#0x2\n" "mov x19, #0x2\n" + "ldr h29, [x25], #0x2\n" + "ldr h28, [x24], #0x2\n" + "ldr h25, [x23], #0x2\n" + "ldr h23, [x22], #0x2\n" + "ldr h27, [x21], #0x2\n" + "ldr h26, [x20], #0x2\n" "tbz %x[width], #0, 7f\n" - "ld1 { v30.b }[2], [x27]\n" - "ld1 { v29.b }[2], [x26]\n" - "ld1 { v28.b }[2], [x25]\n" - "ld1 { v27.b }[2], [x24]\n" - "ld1 { v23.b }[2], [x23]\n" - "ld1 { v21.b }[2], [x22]\n" - "ld1 { v26.b }[2], [x21]\n" - "ld1 { v25.b }[2], [x20]\n" + "ld1 { v31.b }[2], [x27]\n" "mov x19, #0x3\n" + "ld1 { v30.b }[2], [x26]\n" + "ld1 { v29.b }[2], [x25]\n" + "ld1 { v28.b }[2], [x24]\n" + "ld1 { v25.b }[2], [x23]\n" + "ld1 { v23.b }[2], [x22]\n" + "ld1 { v27.b }[2], [x21]\n" + "ld1 { v26.b }[2], [x20]\n" "b 7f\n" "6:" // odd_loads_1_0 - "ldr b30, [x27, #0x0]\n" - "ldr b29, [x26, #0x0]\n" - "ldr b28, [x25, #0x0]\n" - "ldr b27, [x24, #0x0]\n" - "ldr b23, [x23, #0x0]\n" - "ldr b21, [x22, #0x0]\n" - "ldr b26, [x21, #0x0]\n" - "ldr b25, [x20, #0x0]\n" + "ldr b31, [x27, #0x0]\n" "mov x19, #0x1\n" + "ldr b30, [x26, #0x0]\n" + "ldr b29, [x25, #0x0]\n" + "ldr b28, [x24, #0x0]\n" + "ldr b25, [x23, #0x0]\n" + "ldr b23, [x22, #0x0]\n" + "ldr b27, [x21, #0x0]\n" + "ldr b26, [x20, #0x0]\n" "7:" // Odd load end + "sshll v31.8h, v31.8b, #0x0\n" + "subs x19, x19, #0x1\n" "sshll v30.8h, v30.8b, #0x0\n" "sshll v29.8h, v29.8b, #0x0\n" "sshll v28.8h, v28.8b, #0x0\n" - "sshll v27.8h, v27.8b, #0x0\n" + "sshll v25.8h, v25.8b, #0x0\n" + "zip1 v20.8h, v31.8h, v25.8h\n" "sshll v23.8h, v23.8b, #0x0\n" "zip1 v24.8h, v30.8h, v23.8h\n" - "sshll v21.8h, v21.8b, #0x0\n" + "sshll v27.8h, v27.8b, #0x0\n" + "zip1 v19.8h, v29.8h, v27.8h\n" + "zip1 v22.8h, v20.8h, v19.8h\n" "sshll v26.8h, v26.8b, #0x0\n" - "zip1 v20.8h, v28.8h, v26.8h\n" - "sshll v25.8h, v25.8b, #0x0\n" - "zip1 v22.8h, v29.8h, v21.8h\n" - "subs x19, x19, #0x1\n" - "zip1 v18.8h, v24.8h, v20.8h\n" - "zip1 v19.8h, v27.8h, v25.8h\n" - "zip1 v17.8h, v22.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" + "zip1 v18.8h, v28.8h, v26.8h\n" + "zip1 
v17.8h, v24.8h, v18.8h\n" + "zip1 v16.8h, v22.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v16.8h, v18.8h, v17.8h\n" - "subs x19, x19, #0x1\n" + "zip2 v16.8h, v22.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" + "subs x19, x19, #0x1\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v18.8h, v24.8h, v20.8h\n" - "zip2 v17.8h, v22.8h, v19.8h\n" + "zip2 v21.8h, v20.8h, v19.8h\n" + "zip2 v17.8h, v24.8h, v18.8h\n" "subs x19, x19, #0x1\n" - "zip1 v16.8h, v18.8h, v17.8h\n" + "zip1 v16.8h, v21.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v16.8h, v18.8h, v17.8h\n" - "subs x19, x19, #0x1\n" + "zip2 v16.8h, v21.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" + "subs x19, x19, #0x1\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v23.8h, v30.8h, v23.8h\n" - "zip2 v20.8h, v28.8h, v26.8h\n" + "zip2 v25.8h, v31.8h, v25.8h\n" + "zip2 v19.8h, v29.8h, v27.8h\n" "subs x19, x19, #0x1\n" - "zip1 v18.8h, v23.8h, v20.8h\n" - "zip2 v21.8h, v29.8h, v21.8h\n" - "zip2 v19.8h, v27.8h, v25.8h\n" - "zip1 v17.8h, v21.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" + "zip1 v20.8h, v25.8h, v19.8h\n" + "zip2 v23.8h, v30.8h, v23.8h\n" + "zip2 v18.8h, v28.8h, v26.8h\n" + "zip1 v17.8h, v23.8h, v18.8h\n" + "zip1 v16.8h, v20.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v16.8h, v18.8h, v17.8h\n" - "subs x19, x19, #0x1\n" + "zip2 v16.8h, v20.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" + "subs x19, x19, #0x1\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v18.8h, v23.8h, v20.8h\n" - "zip2 v17.8h, v21.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" + "zip2 v19.8h, v25.8h, v19.8h\n" + "zip2 v17.8h, v23.8h, v18.8h\n" + "zip1 v16.8h, v19.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "8:" // Odds skip - : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + 
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp index 35dc3dc0d4..b710861417 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -96,72 +96,72 @@ void interleave_block<8, 1, VLType::None, true>( "movi v1.8h, #0x0\n" "4:" // no_accumulate_16 "ldr d30, [x27], #0x8\n" - "prfm pldl1keep, [x27, #0x70]\n" + "sshll v30.8h, v30.8b, #0x0\n" "ldr d29, [x26], #0x8\n" + "add x19, x19, #0x1\n" + "sshll v29.8h, v29.8b, #0x0\n" "ldr d28, [x25], #0x8\n" - "prfm pldl1keep, [x26, #0x70]\n" + "subs %x[width], %x[width], #0x8\n" + "sshll v28.8h, v28.8b, #0x0\n" "ldr d27, [x24], #0x8\n" + "cmp %x[width], #0x8\n" + "sshll v27.8h, v27.8b, #0x0\n" + "ldr d24, [x23], #0x8\n" + "ldr d23, [x22], #0x8\n" + "sshll v24.8h, v24.8b, #0x0\n" + "ldr d21, [x21], #0x8\n" + "sshll v23.8h, v23.8b, #0x0\n" + "ldr d26, [x20], #0x8\n" + "zip1 v20.8h, v30.8h, v24.8h\n" + "prfm pldl1keep, [x27, #0x70]\n" + "zip1 v25.8h, v29.8h, v23.8h\n" + "prfm pldl1keep, [x26, #0x70]\n" + "zip2 v24.8h, v30.8h, v24.8h\n" "prfm pldl1keep, [x25, #0x70]\n" - "ldr d23, [x23], #0x8\n" - "ldr d21, [x22], #0x8\n" + "zip2 v23.8h, v29.8h, v23.8h\n" "prfm pldl1keep, [x24, #0x70]\n" - "ldr d26, [x21], #0x8\n" - "ldr d25, [x20], #0x8\n" + "sshll v21.8h, v21.8b, #0x0\n" "prfm pldl1keep, [x23, #0x70]\n" + "zip1 v19.8h, v28.8h, v21.8h\n" "prfm pldl1keep, [x22, #0x70]\n" - "sshll v30.8h, v30.8b, #0x0\n" - "sshll v29.8h, v29.8b, #0x0\n" + "zip1 v22.8h, v20.8h, v19.8h\n" "prfm pldl1keep, [x21, #0x70]\n" - "sshll v28.8h, v28.8b, #0x0\n" + "zip2 v19.8h, v20.8h, v19.8h\n" "prfm pldl1keep, [x20, #0x70]\n" - "sshll v27.8h, v27.8b, #0x0\n" - "sshll v23.8h, v23.8b, #0x0\n" - "zip1 v24.8h, v30.8h, v23.8h\n" - "sshll v21.8h, v21.8b, #0x0\n" - "zip2 v23.8h, v30.8h, v23.8h\n" + "zip2 v20.8h, v28.8h, v21.8h\n" + "zip1 v21.8h, v24.8h, v20.8h\n" + "zip2 v20.8h, v24.8h, v20.8h\n" "sshll v26.8h, v26.8b, #0x0\n" - "sshll v25.8h, v25.8b, #0x0\n" - "zip1 v22.8h, v29.8h, v21.8h\n" - "add x19, x19, #0x1\n" - "zip2 v21.8h, v29.8h, v21.8h\n" - "subs %x[width], %x[width], #0x8\n" - "zip1 v20.8h, v28.8h, v26.8h\n" - "cmp %x[width], #0x8\n" - "zip1 
v18.8h, v24.8h, v20.8h\n" - "zip1 v19.8h, v27.8h, v25.8h\n" - "zip1 v17.8h, v22.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" + "zip1 v18.8h, v27.8h, v26.8h\n" + "zip1 v17.8h, v25.8h, v18.8h\n" + "zip1 v16.8h, v22.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add v1.8h, v1.8h, v16.8h\n" - "zip2 v16.8h, v18.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x10]\n" - "zip2 v18.8h, v24.8h, v20.8h\n" - "add v1.8h, v1.8h, v16.8h\n" - "zip2 v17.8h, v22.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x20]\n" - "add v1.8h, v1.8h, v16.8h\n" - "zip2 v16.8h, v18.8h, v17.8h\n" + "zip2 v17.8h, v22.8h, v17.8h\n" + "str q17, [%x[out_ptr], #0x10]\n" + "zip2 v16.8h, v25.8h, v18.8h\n" + "add v1.8h, v1.8h, v17.8h\n" + "zip1 v17.8h, v19.8h, v16.8h\n" + "str q17, [%x[out_ptr], #0x20]\n" + "zip2 v16.8h, v19.8h, v16.8h\n" "str q16, [%x[out_ptr], #0x30]\n" - "zip2 v20.8h, v28.8h, v26.8h\n" + "add v1.8h, v1.8h, v17.8h\n" + "zip2 v19.8h, v27.8h, v26.8h\n" + "zip1 v17.8h, v23.8h, v19.8h\n" "add v1.8h, v1.8h, v16.8h\n" - "zip1 v18.8h, v23.8h, v20.8h\n" - "zip2 v19.8h, v27.8h, v25.8h\n" - "zip1 v17.8h, v21.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" + "zip1 v16.8h, v21.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x40]\n" + "zip2 v18.8h, v21.8h, v17.8h\n" + "str q18, [%x[out_ptr], #0x50]\n" "add v1.8h, v1.8h, v16.8h\n" - "zip2 v16.8h, v18.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x50]\n" - "zip2 v18.8h, v23.8h, v20.8h\n" - "add v1.8h, v1.8h, v16.8h\n" - "zip2 v17.8h, v21.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x60]\n" - "add v1.8h, v1.8h, v16.8h\n" - "zip2 v16.8h, v18.8h, v17.8h\n" + "zip2 v16.8h, v23.8h, v19.8h\n" + "zip1 v17.8h, v20.8h, v16.8h\n" + "str q17, [%x[out_ptr], #0x60]\n" + "add v1.8h, v1.8h, v18.8h\n" + "zip2 v16.8h, v20.8h, v16.8h\n" "str q16, [%x[out_ptr], #0x70]\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" + "add v1.8h, v1.8h, v17.8h\n" "add v1.8h, v1.8h, v16.8h\n" "bge 3b\n" "5:" // Main loop skip @@ -171,148 +171,148 @@ 
void interleave_block<8, 1, VLType::None, true>( "ldr s29, [x26], #0x4\n" "ldr s28, [x25], #0x4\n" "ldr s27, [x24], #0x4\n" - "ldr s23, [x23], #0x4\n" - "ldr s21, [x22], #0x4\n" - "ldr s26, [x21], #0x4\n" - "ldr s25, [x20], #0x4\n" + "ldr s24, [x23], #0x4\n" + "ldr s23, [x22], #0x4\n" + "ldr s21, [x21], #0x4\n" + "ldr s26, [x20], #0x4\n" "tbz %x[width], #1, 6f\n" "ld1 { v30.h }[2], [x27], #0x2\n" + "mov x19, #0x6\n" "ld1 { v29.h }[2], [x26], #0x2\n" "ld1 { v28.h }[2], [x25], #0x2\n" "ld1 { v27.h }[2], [x24], #0x2\n" - "ld1 { v23.h }[2], [x23], #0x2\n" - "ld1 { v21.h }[2], [x22], #0x2\n" - "ld1 { v26.h }[2], [x21], #0x2\n" - "ld1 { v25.h }[2], [x20], #0x2\n" - "mov x19, #0x6\n" + "ld1 { v24.h }[2], [x23], #0x2\n" + "ld1 { v23.h }[2], [x22], #0x2\n" + "ld1 { v21.h }[2], [x21], #0x2\n" + "ld1 { v26.h }[2], [x20], #0x2\n" "tbz %x[width], #0, 9f\n" "ld1 { v30.b }[6], [x27]\n" + "mov x19, #0x7\n" "ld1 { v29.b }[6], [x26]\n" "ld1 { v28.b }[6], [x25]\n" "ld1 { v27.b }[6], [x24]\n" - "ld1 { v23.b }[6], [x23]\n" - "ld1 { v21.b }[6], [x22]\n" - "ld1 { v26.b }[6], [x21]\n" - "ld1 { v25.b }[6], [x20]\n" - "mov x19, #0x7\n" + "ld1 { v24.b }[6], [x23]\n" + "ld1 { v23.b }[6], [x22]\n" + "ld1 { v21.b }[6], [x21]\n" + "ld1 { v26.b }[6], [x20]\n" "b 9f\n" "6:" // odd_loads_1_4 "mov x19, #0x4\n" "tbz %x[width], #0, 9f\n" "ld1 { v30.b }[4], [x27]\n" "ld1 { v29.b }[4], [x26]\n" + "mov x19, #0x5\n" "ld1 { v28.b }[4], [x25]\n" "ld1 { v27.b }[4], [x24]\n" - "ld1 { v23.b }[4], [x23]\n" - "ld1 { v21.b }[4], [x22]\n" - "ld1 { v26.b }[4], [x21]\n" - "ld1 { v25.b }[4], [x20]\n" - "mov x19, #0x5\n" + "ld1 { v24.b }[4], [x23]\n" + "ld1 { v23.b }[4], [x22]\n" + "ld1 { v21.b }[4], [x21]\n" + "ld1 { v26.b }[4], [x20]\n" "b 9f\n" "7:" // odd_loads_2_0 "tbz %x[width], #1, 8f\n" "ldr h30, [x27], #0x2\n" "ldr h29, [x26], #0x2\n" + "mov x19, #0x2\n" "ldr h28, [x25], #0x2\n" "ldr h27, [x24], #0x2\n" - "ldr h23, [x23], #0x2\n" - "ldr h21, [x22], #0x2\n" - "ldr h26, [x21], #0x2\n" - "ldr h25, [x20], #0x2\n" 
- "mov x19, #0x2\n" + "ldr h24, [x23], #0x2\n" + "ldr h23, [x22], #0x2\n" + "ldr h21, [x21], #0x2\n" + "ldr h26, [x20], #0x2\n" "tbz %x[width], #0, 9f\n" "ld1 { v30.b }[2], [x27]\n" + "mov x19, #0x3\n" "ld1 { v29.b }[2], [x26]\n" "ld1 { v28.b }[2], [x25]\n" "ld1 { v27.b }[2], [x24]\n" - "ld1 { v23.b }[2], [x23]\n" - "ld1 { v21.b }[2], [x22]\n" - "ld1 { v26.b }[2], [x21]\n" - "ld1 { v25.b }[2], [x20]\n" - "mov x19, #0x3\n" + "ld1 { v24.b }[2], [x23]\n" + "ld1 { v23.b }[2], [x22]\n" + "ld1 { v21.b }[2], [x21]\n" + "ld1 { v26.b }[2], [x20]\n" "b 9f\n" "8:" // odd_loads_1_0 "ldr b30, [x27, #0x0]\n" + "mov x19, #0x1\n" "ldr b29, [x26, #0x0]\n" "ldr b28, [x25, #0x0]\n" "ldr b27, [x24, #0x0]\n" - "ldr b23, [x23, #0x0]\n" - "ldr b21, [x22, #0x0]\n" - "ldr b26, [x21, #0x0]\n" - "ldr b25, [x20, #0x0]\n" - "mov x19, #0x1\n" + "ldr b24, [x23, #0x0]\n" + "ldr b23, [x22, #0x0]\n" + "ldr b21, [x21, #0x0]\n" + "ldr b26, [x20, #0x0]\n" "9:" // Odd load end "sshll v30.8h, v30.8b, #0x0\n" + "subs x19, x19, #0x1\n" "sshll v29.8h, v29.8b, #0x0\n" "sshll v28.8h, v28.8b, #0x0\n" "sshll v27.8h, v27.8b, #0x0\n" + "sshll v24.8h, v24.8b, #0x0\n" + "zip1 v20.8h, v30.8h, v24.8h\n" "sshll v23.8h, v23.8b, #0x0\n" - "zip1 v24.8h, v30.8h, v23.8h\n" + "zip1 v25.8h, v29.8h, v23.8h\n" "sshll v21.8h, v21.8b, #0x0\n" + "zip1 v19.8h, v28.8h, v21.8h\n" + "zip1 v22.8h, v20.8h, v19.8h\n" "sshll v26.8h, v26.8b, #0x0\n" - "zip1 v20.8h, v28.8h, v26.8h\n" - "sshll v25.8h, v25.8b, #0x0\n" - "zip1 v22.8h, v29.8h, v21.8h\n" - "subs x19, x19, #0x1\n" - "zip1 v18.8h, v24.8h, v20.8h\n" - "zip1 v19.8h, v27.8h, v25.8h\n" - "zip1 v17.8h, v22.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" + "zip1 v18.8h, v27.8h, v26.8h\n" + "zip1 v17.8h, v25.8h, v18.8h\n" + "zip1 v16.8h, v22.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" - "add v1.8h, v1.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" + "add v1.8h, v1.8h, v16.8h\n" "beq 10f\n" - "zip2 v16.8h, v18.8h, v17.8h\n" + "zip2 v17.8h, v22.8h, v17.8h\n" + "str q17, 
[%x[out_ptr], #0x0]\n" "subs x19, x19, #0x1\n" - "add v1.8h, v1.8h, v16.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v17.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v18.8h, v24.8h, v20.8h\n" - "zip2 v17.8h, v22.8h, v19.8h\n" + "zip2 v19.8h, v20.8h, v19.8h\n" + "zip2 v16.8h, v25.8h, v18.8h\n" "subs x19, x19, #0x1\n" - "zip1 v16.8h, v18.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" - "add v1.8h, v1.8h, v16.8h\n" + "zip1 v17.8h, v19.8h, v16.8h\n" + "str q17, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" + "add v1.8h, v1.8h, v17.8h\n" "beq 10f\n" - "zip2 v16.8h, v18.8h, v17.8h\n" + "zip2 v16.8h, v19.8h, v16.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" "subs x19, x19, #0x1\n" "add v1.8h, v1.8h, v16.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v23.8h, v30.8h, v23.8h\n" - "zip2 v20.8h, v28.8h, v26.8h\n" + "zip2 v24.8h, v30.8h, v24.8h\n" + "zip2 v20.8h, v28.8h, v21.8h\n" "subs x19, x19, #0x1\n" - "zip1 v18.8h, v23.8h, v20.8h\n" - "zip2 v21.8h, v29.8h, v21.8h\n" - "zip2 v19.8h, v27.8h, v25.8h\n" - "zip1 v17.8h, v21.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" + "zip1 v21.8h, v24.8h, v20.8h\n" + "zip2 v23.8h, v29.8h, v23.8h\n" + "zip2 v19.8h, v27.8h, v26.8h\n" + "zip1 v17.8h, v23.8h, v19.8h\n" + "zip1 v16.8h, v21.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" - "add v1.8h, v1.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" + "add v1.8h, v1.8h, v16.8h\n" "beq 10f\n" - "zip2 v16.8h, v18.8h, v17.8h\n" + "zip2 v18.8h, v21.8h, v17.8h\n" + "str q18, [%x[out_ptr], #0x0]\n" "subs x19, x19, #0x1\n" - "add v1.8h, v1.8h, v16.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v18.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v18.8h, v23.8h, v20.8h\n" - "zip2 v17.8h, v21.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" - "add v1.8h, v1.8h, v16.8h\n" + "zip2 v20.8h, v24.8h, v20.8h\n" + "zip2 v16.8h, v23.8h, 
v19.8h\n" + "zip1 v17.8h, v20.8h, v16.8h\n" + "str q17, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" + "add v1.8h, v1.8h, v17.8h\n" "10:" // Odds skip "saddw v0.4s, v0.4s, v1.4h\n" "str q0, [%x[out_ptr], #0x0]\n" "saddw2 v31.4s, v31.4s, v1.8h\n" "str q31, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" - : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp index bfa8989a4d..24ece9a68e 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -96,58 +96,58 @@ void interleave_block<8, 1, VLType::None, true>( "movi v1.8h, #0x0\n" "4:" // no_accumulate_16 "ldr q30, [x27], #0x10\n" - "prfm pldl1keep, [x27, #0x70]\n" + "add x19, x19, #0x1\n" "ldr q29, [x26], #0x10\n" + "subs %x[width], %x[width], #0x8\n" "ldr q28, [x25], #0x10\n" - "prfm pldl1keep, [x26, #0x70]\n" + "cmp %x[width], #0x8\n" "ldr q27, [x24], #0x10\n" + "ldr q25, [x23], #0x10\n" + "zip1 v26.8h, v30.8h, v25.8h\n" + "ldr q21, [x22], #0x10\n" + "zip2 v25.8h, v30.8h, v25.8h\n" + "ldr q24, [x21], #0x10\n" + "ldr q23, [x20], #0x10\n" + "zip1 v22.8h, v29.8h, v21.8h\n" + "prfm pldl1keep, [x27, #0x70]\n" + "zip2 v21.8h, v29.8h, v21.8h\n" + "prfm pldl1keep, [x26, #0x70]\n" + "zip1 v20.8h, v28.8h, v24.8h\n" "prfm pldl1keep, [x25, #0x70]\n" - "ldr q24, [x23], #0x10\n" - "zip1 v26.8h, v30.8h, v24.8h\n" + "zip1 v18.8h, v26.8h, v20.8h\n" "prfm pldl1keep, [x24, #0x70]\n" - "ldr q25, [x22], #0x10\n" - "zip2 v24.8h, v30.8h, v24.8h\n" + "zip1 v19.8h, v27.8h, v23.8h\n" "prfm pldl1keep, [x23, #0x70]\n" - "ldr q23, [x21], #0x10\n" - "zip1 v21.8h, v29.8h, v25.8h\n" + "zip1 v16.8h, v22.8h, v19.8h\n" "prfm pldl1keep, [x22, #0x70]\n" - "ldr q22, [x20], #0x10\n" - "zip1 v18.8h, v28.8h, v23.8h\n" + "zip1 v17.8h, v18.8h, v16.8h\n" "prfm pldl1keep, [x21, #0x70]\n" - "add x19, x19, #0x1\n" - "zip1 v20.8h, v26.8h, v18.8h\n" - "prfm pldl1keep, [x20, #0x70]\n" - "zip1 v19.8h, v27.8h, v22.8h\n" - "subs %x[width], %x[width], #0x8\n" - "zip1 v17.8h, v21.8h, v19.8h\n" - "cmp %x[width], #0x8\n" - "zip2 v18.8h, v26.8h, v18.8h\n" - "zip1 v16.8h, v20.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" - "add v1.8h, v1.8h, v16.8h\n" - "zip2 v17.8h, v20.8h, v17.8h\n" - "str q17, [%x[out_ptr], #0x10]\n" - "zip2 v16.8h, v21.8h, v19.8h\n" "add v1.8h, v1.8h, v17.8h\n" - "zip1 v17.8h, v18.8h, v16.8h\n" - "str q17, [%x[out_ptr], #0x20]\n" + "prfm pldl1keep, [x20, #0x70]\n" "zip2 v16.8h, v18.8h, v16.8h\n" + "str q17, [%x[out_ptr], #0x0]\n" + "zip2 v18.8h, 
v26.8h, v20.8h\n" + "str q16, [%x[out_ptr], #0x10]\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v17.8h, v22.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x20]\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x30]\n" - "add v1.8h, v1.8h, v17.8h\n" - "zip2 v21.8h, v28.8h, v23.8h\n" - "zip1 v18.8h, v24.8h, v21.8h\n" + "zip2 v20.8h, v28.8h, v24.8h\n" "add v1.8h, v1.8h, v16.8h\n" - "zip2 v20.8h, v29.8h, v25.8h\n" - "zip2 v19.8h, v27.8h, v22.8h\n" - "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v18.8h, v25.8h, v20.8h\n" + "zip2 v19.8h, v27.8h, v23.8h\n" + "zip1 v17.8h, v21.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x40]\n" "add v1.8h, v1.8h, v16.8h\n" "zip2 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x50]\n" - "zip2 v18.8h, v24.8h, v21.8h\n" + "zip2 v18.8h, v25.8h, v20.8h\n" "add v1.8h, v1.8h, v16.8h\n" - "zip2 v17.8h, v20.8h, v19.8h\n" + "zip2 v17.8h, v21.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x60]\n" "add v1.8h, v1.8h, v16.8h\n" @@ -163,140 +163,140 @@ void interleave_block<8, 1, VLType::None, true>( "ldr d29, [x26], #0x8\n" "ldr d28, [x25], #0x8\n" "ldr d27, [x24], #0x8\n" - "ldr d24, [x23], #0x8\n" - "ldr d25, [x22], #0x8\n" - "ldr d23, [x21], #0x8\n" - "ldr d22, [x20], #0x8\n" + "ldr d25, [x23], #0x8\n" + "ldr d21, [x22], #0x8\n" + "ldr d24, [x21], #0x8\n" + "ldr d23, [x20], #0x8\n" "tbz %x[width], #1, 6f\n" "ld1 { v30.s }[2], [x27], #0x4\n" + "mov x19, #0x6\n" "ld1 { v29.s }[2], [x26], #0x4\n" "ld1 { v28.s }[2], [x25], #0x4\n" "ld1 { v27.s }[2], [x24], #0x4\n" - "ld1 { v24.s }[2], [x23], #0x4\n" - "ld1 { v25.s }[2], [x22], #0x4\n" - "ld1 { v23.s }[2], [x21], #0x4\n" - "ld1 { v22.s }[2], [x20], #0x4\n" - "mov x19, #0x6\n" + "ld1 { v25.s }[2], [x23], #0x4\n" + "ld1 { v21.s }[2], [x22], #0x4\n" + "ld1 { v24.s }[2], [x21], #0x4\n" + "ld1 { v23.s }[2], [x20], #0x4\n" "tbz %x[width], #0, 9f\n" "ld1 { v30.h }[6], [x27]\n" + "mov x19, 
#0x7\n" "ld1 { v29.h }[6], [x26]\n" "ld1 { v28.h }[6], [x25]\n" "ld1 { v27.h }[6], [x24]\n" - "ld1 { v24.h }[6], [x23]\n" - "ld1 { v25.h }[6], [x22]\n" - "ld1 { v23.h }[6], [x21]\n" - "ld1 { v22.h }[6], [x20]\n" - "mov x19, #0x7\n" + "ld1 { v25.h }[6], [x23]\n" + "ld1 { v21.h }[6], [x22]\n" + "ld1 { v24.h }[6], [x21]\n" + "ld1 { v23.h }[6], [x20]\n" "b 9f\n" "6:" // odd_loads_1_4 "mov x19, #0x4\n" "tbz %x[width], #0, 9f\n" "ld1 { v30.h }[4], [x27]\n" "ld1 { v29.h }[4], [x26]\n" + "mov x19, #0x5\n" "ld1 { v28.h }[4], [x25]\n" "ld1 { v27.h }[4], [x24]\n" - "ld1 { v24.h }[4], [x23]\n" - "ld1 { v25.h }[4], [x22]\n" - "ld1 { v23.h }[4], [x21]\n" - "ld1 { v22.h }[4], [x20]\n" - "mov x19, #0x5\n" + "ld1 { v25.h }[4], [x23]\n" + "ld1 { v21.h }[4], [x22]\n" + "ld1 { v24.h }[4], [x21]\n" + "ld1 { v23.h }[4], [x20]\n" "b 9f\n" "7:" // odd_loads_2_0 "tbz %x[width], #1, 8f\n" "ldr s30, [x27], #0x4\n" "ldr s29, [x26], #0x4\n" + "mov x19, #0x2\n" "ldr s28, [x25], #0x4\n" "ldr s27, [x24], #0x4\n" - "ldr s24, [x23], #0x4\n" - "ldr s25, [x22], #0x4\n" - "ldr s23, [x21], #0x4\n" - "ldr s22, [x20], #0x4\n" - "mov x19, #0x2\n" + "ldr s25, [x23], #0x4\n" + "ldr s21, [x22], #0x4\n" + "ldr s24, [x21], #0x4\n" + "ldr s23, [x20], #0x4\n" "tbz %x[width], #0, 9f\n" "ld1 { v30.h }[2], [x27]\n" + "mov x19, #0x3\n" "ld1 { v29.h }[2], [x26]\n" "ld1 { v28.h }[2], [x25]\n" "ld1 { v27.h }[2], [x24]\n" - "ld1 { v24.h }[2], [x23]\n" - "ld1 { v25.h }[2], [x22]\n" - "ld1 { v23.h }[2], [x21]\n" - "ld1 { v22.h }[2], [x20]\n" - "mov x19, #0x3\n" + "ld1 { v25.h }[2], [x23]\n" + "ld1 { v21.h }[2], [x22]\n" + "ld1 { v24.h }[2], [x21]\n" + "ld1 { v23.h }[2], [x20]\n" "b 9f\n" "8:" // odd_loads_1_0 "ldr h30, [x27, #0x0]\n" + "mov x19, #0x1\n" "ldr h29, [x26, #0x0]\n" "ldr h28, [x25, #0x0]\n" "ldr h27, [x24, #0x0]\n" - "ldr h24, [x23, #0x0]\n" - "ldr h25, [x22, #0x0]\n" - "ldr h23, [x21, #0x0]\n" - "ldr h22, [x20, #0x0]\n" - "mov x19, #0x1\n" + "ldr h25, [x23, #0x0]\n" + "ldr h21, [x22, #0x0]\n" + "ldr h24, 
[x21, #0x0]\n" + "ldr h23, [x20, #0x0]\n" "9:" // Odd load end - "zip1 v26.8h, v30.8h, v24.8h\n" + "zip1 v26.8h, v30.8h, v25.8h\n" "subs x19, x19, #0x1\n" - "zip1 v18.8h, v28.8h, v23.8h\n" - "zip1 v20.8h, v26.8h, v18.8h\n" - "zip1 v21.8h, v29.8h, v25.8h\n" - "zip1 v19.8h, v27.8h, v22.8h\n" - "zip1 v17.8h, v21.8h, v19.8h\n" - "zip1 v16.8h, v20.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" - "add v1.8h, v1.8h, v16.8h\n" + "zip1 v20.8h, v28.8h, v24.8h\n" + "zip1 v18.8h, v26.8h, v20.8h\n" + "zip1 v22.8h, v29.8h, v21.8h\n" + "zip1 v19.8h, v27.8h, v23.8h\n" + "zip1 v16.8h, v22.8h, v19.8h\n" + "zip1 v17.8h, v18.8h, v16.8h\n" + "str q17, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" + "add v1.8h, v1.8h, v17.8h\n" "beq 10f\n" - "zip2 v17.8h, v20.8h, v17.8h\n" + "zip2 v16.8h, v18.8h, v16.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" "subs x19, x19, #0x1\n" - "add v1.8h, v1.8h, v17.8h\n" - "str q17, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v18.8h, v26.8h, v18.8h\n" - "zip2 v16.8h, v21.8h, v19.8h\n" + "zip2 v18.8h, v26.8h, v20.8h\n" + "zip2 v17.8h, v22.8h, v19.8h\n" "subs x19, x19, #0x1\n" - "zip1 v17.8h, v18.8h, v16.8h\n" - "str q17, [%x[out_ptr], #0x0]\n" - "add v1.8h, v1.8h, v17.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" + "add v1.8h, v1.8h, v16.8h\n" "beq 10f\n" - "zip2 v16.8h, v18.8h, v16.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" "subs x19, x19, #0x1\n" "add v1.8h, v1.8h, v16.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v24.8h, v30.8h, v24.8h\n" - "zip2 v21.8h, v28.8h, v23.8h\n" + "zip2 v25.8h, v30.8h, v25.8h\n" + "zip2 v20.8h, v28.8h, v24.8h\n" "subs x19, x19, #0x1\n" - "zip1 v18.8h, v24.8h, v21.8h\n" - "zip2 v20.8h, v29.8h, v25.8h\n" - "zip2 v19.8h, v27.8h, v22.8h\n" - "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v18.8h, v25.8h, 
v20.8h\n" + "zip2 v21.8h, v29.8h, v21.8h\n" + "zip2 v19.8h, v27.8h, v23.8h\n" + "zip1 v17.8h, v21.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" - "add v1.8h, v1.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" + "add v1.8h, v1.8h, v16.8h\n" "beq 10f\n" "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" "subs x19, x19, #0x1\n" "add v1.8h, v1.8h, v16.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v18.8h, v24.8h, v21.8h\n" - "zip2 v17.8h, v20.8h, v19.8h\n" + "zip2 v18.8h, v25.8h, v20.8h\n" + "zip2 v17.8h, v21.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" - "add v1.8h, v1.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" + "add v1.8h, v1.8h, v16.8h\n" "10:" // Odds skip "uaddw v0.4s, v0.4s, v1.4h\n" "str q0, [%x[out_ptr], #0x0]\n" "uaddw2 v31.4s, v31.4s, v1.8h\n" "str q31, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" - : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp index 86b90f1898..0db2f7fd51 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -79,206 +79,206 @@ void interleave_block<8, 1, VLType::None, false>( "prfm pldl1keep, [x20, #0x40]\n" "blt 3f\n" "2:" // Main loop head - "ldr d30, [x27], #0x8\n" + "ldr d31, [x27], #0x8\n" + "ushll v31.8h, v31.8b, #0x0\n" + "ldr d30, [x26], #0x8\n" + "subs %x[width], %x[width], #0x8\n" + "ushll v30.8h, v30.8b, #0x0\n" + "ldr d29, [x25], #0x8\n" + "cmp %x[width], #0x8\n" + "ushll v29.8h, v29.8b, #0x0\n" + "ldr d28, [x24], #0x8\n" + "ldr d25, [x23], #0x8\n" + "ushll v28.8h, v28.8b, #0x0\n" + "ldr d23, [x22], #0x8\n" + "ushll v25.8h, v25.8b, #0x0\n" + "ldr d27, [x21], #0x8\n" + "zip1 v20.8h, v31.8h, v25.8h\n" + "ldr d26, [x20], #0x8\n" + "zip2 v25.8h, v31.8h, v25.8h\n" "prfm pldl1keep, [x27, #0x70]\n" - "ldr d29, [x26], #0x8\n" - "ldr d28, [x25], #0x8\n" "prfm pldl1keep, [x26, #0x70]\n" - "ldr d27, [x24], #0x8\n" + "ushll v23.8h, v23.8b, #0x0\n" "prfm pldl1keep, [x25, #0x70]\n" - "ldr d23, [x23], #0x8\n" - "ldr d21, [x22], #0x8\n" + "zip1 v24.8h, v30.8h, v23.8h\n" + "zip2 v23.8h, v30.8h, v23.8h\n" "prfm pldl1keep, [x24, #0x70]\n" - "ldr d26, [x21], #0x8\n" - "ldr d25, [x20], #0x8\n" + "ushll v27.8h, v27.8b, #0x0\n" "prfm pldl1keep, [x23, #0x70]\n" + "zip1 v19.8h, v29.8h, v27.8h\n" "prfm pldl1keep, [x22, #0x70]\n" - "ushll v30.8h, v30.8b, #0x0\n" - "ushll v29.8h, v29.8b, #0x0\n" + "zip1 v22.8h, v20.8h, v19.8h\n" "prfm pldl1keep, [x21, #0x70]\n" - "ushll v28.8h, v28.8b, #0x0\n" + "zip2 v21.8h, v20.8h, v19.8h\n" "prfm pldl1keep, [x20, #0x70]\n" - "ushll v27.8h, v27.8b, #0x0\n" - "ushll v23.8h, v23.8b, #0x0\n" - "zip1 v24.8h, v30.8h, v23.8h\n" - "ushll v21.8h, v21.8b, #0x0\n" - "zip2 v23.8h, v30.8h, v23.8h\n" + "zip2 v19.8h, v29.8h, v27.8h\n" + "zip1 v20.8h, v25.8h, v19.8h\n" + "zip2 v19.8h, v25.8h, v19.8h\n" "ushll v26.8h, v26.8b, #0x0\n" - "ushll v25.8h, v25.8b, #0x0\n" - "zip1 v22.8h, v29.8h, v21.8h\n" - "subs %x[width], %x[width], #0x8\n" - "zip2 v21.8h, v29.8h, v21.8h\n" - "cmp %x[width], #0x8\n" - "zip1 v20.8h, v28.8h, 
v26.8h\n" - "zip1 v18.8h, v24.8h, v20.8h\n" - "zip1 v19.8h, v27.8h, v25.8h\n" - "zip1 v17.8h, v22.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" + "zip1 v18.8h, v28.8h, v26.8h\n" + "zip1 v17.8h, v24.8h, v18.8h\n" + "zip1 v16.8h, v22.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" - "zip2 v16.8h, v18.8h, v17.8h\n" + "zip2 v16.8h, v22.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x10]\n" - "zip2 v18.8h, v24.8h, v20.8h\n" - "zip2 v17.8h, v22.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" + "zip2 v17.8h, v24.8h, v18.8h\n" + "zip1 v16.8h, v21.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x20]\n" - "zip2 v16.8h, v18.8h, v17.8h\n" + "zip2 v16.8h, v21.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x30]\n" - "zip2 v20.8h, v28.8h, v26.8h\n" - "zip1 v18.8h, v23.8h, v20.8h\n" - "zip2 v19.8h, v27.8h, v25.8h\n" - "zip1 v17.8h, v21.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" + "zip2 v18.8h, v28.8h, v26.8h\n" + "zip1 v17.8h, v23.8h, v18.8h\n" + "zip1 v16.8h, v20.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x40]\n" - "zip2 v16.8h, v18.8h, v17.8h\n" + "zip2 v16.8h, v20.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x50]\n" - "zip2 v18.8h, v23.8h, v20.8h\n" - "zip2 v17.8h, v21.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" + "zip2 v17.8h, v23.8h, v18.8h\n" + "zip1 v16.8h, v19.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x60]\n" - "zip2 v16.8h, v18.8h, v17.8h\n" + "zip2 v16.8h, v19.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x70]\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" "bge 2b\n" "3:" // Main loop skip "cbz %x[width], 8f\n" "tbz %x[width], #2, 5f\n" - "ldr s30, [x27], #0x4\n" - "ldr s29, [x26], #0x4\n" - "ldr s28, [x25], #0x4\n" - "ldr s27, [x24], #0x4\n" - "ldr s23, [x23], #0x4\n" - "ldr s21, [x22], #0x4\n" - "ldr s26, [x21], #0x4\n" - "ldr s25, [x20], #0x4\n" + "ldr s31, [x27], #0x4\n" + "ldr s30, [x26], #0x4\n" + "ldr s29, [x25], #0x4\n" + "ldr s28, [x24], #0x4\n" + "ldr s25, [x23], #0x4\n" + "ldr s23, [x22], #0x4\n" + "ldr s27, [x21], #0x4\n" + "ldr s26, [x20], #0x4\n" "tbz %x[width], #1, 4f\n" - "ld1 { v30.h 
}[2], [x27], #0x2\n" - "ld1 { v29.h }[2], [x26], #0x2\n" - "ld1 { v28.h }[2], [x25], #0x2\n" - "ld1 { v27.h }[2], [x24], #0x2\n" - "ld1 { v23.h }[2], [x23], #0x2\n" - "ld1 { v21.h }[2], [x22], #0x2\n" - "ld1 { v26.h }[2], [x21], #0x2\n" - "ld1 { v25.h }[2], [x20], #0x2\n" + "ld1 { v31.h }[2], [x27], #0x2\n" "mov x19, #0x6\n" + "ld1 { v30.h }[2], [x26], #0x2\n" + "ld1 { v29.h }[2], [x25], #0x2\n" + "ld1 { v28.h }[2], [x24], #0x2\n" + "ld1 { v25.h }[2], [x23], #0x2\n" + "ld1 { v23.h }[2], [x22], #0x2\n" + "ld1 { v27.h }[2], [x21], #0x2\n" + "ld1 { v26.h }[2], [x20], #0x2\n" "tbz %x[width], #0, 7f\n" - "ld1 { v30.b }[6], [x27]\n" - "ld1 { v29.b }[6], [x26]\n" - "ld1 { v28.b }[6], [x25]\n" - "ld1 { v27.b }[6], [x24]\n" - "ld1 { v23.b }[6], [x23]\n" - "ld1 { v21.b }[6], [x22]\n" - "ld1 { v26.b }[6], [x21]\n" - "ld1 { v25.b }[6], [x20]\n" + "ld1 { v31.b }[6], [x27]\n" "mov x19, #0x7\n" + "ld1 { v30.b }[6], [x26]\n" + "ld1 { v29.b }[6], [x25]\n" + "ld1 { v28.b }[6], [x24]\n" + "ld1 { v25.b }[6], [x23]\n" + "ld1 { v23.b }[6], [x22]\n" + "ld1 { v27.b }[6], [x21]\n" + "ld1 { v26.b }[6], [x20]\n" "b 7f\n" "4:" // odd_loads_1_4 "mov x19, #0x4\n" "tbz %x[width], #0, 7f\n" - "ld1 { v30.b }[4], [x27]\n" - "ld1 { v29.b }[4], [x26]\n" - "ld1 { v28.b }[4], [x25]\n" - "ld1 { v27.b }[4], [x24]\n" - "ld1 { v23.b }[4], [x23]\n" - "ld1 { v21.b }[4], [x22]\n" - "ld1 { v26.b }[4], [x21]\n" - "ld1 { v25.b }[4], [x20]\n" + "ld1 { v31.b }[4], [x27]\n" + "ld1 { v30.b }[4], [x26]\n" "mov x19, #0x5\n" + "ld1 { v29.b }[4], [x25]\n" + "ld1 { v28.b }[4], [x24]\n" + "ld1 { v25.b }[4], [x23]\n" + "ld1 { v23.b }[4], [x22]\n" + "ld1 { v27.b }[4], [x21]\n" + "ld1 { v26.b }[4], [x20]\n" "b 7f\n" "5:" // odd_loads_2_0 "tbz %x[width], #1, 6f\n" - "ldr h30, [x27], #0x2\n" - "ldr h29, [x26], #0x2\n" - "ldr h28, [x25], #0x2\n" - "ldr h27, [x24], #0x2\n" - "ldr h23, [x23], #0x2\n" - "ldr h21, [x22], #0x2\n" - "ldr h26, [x21], #0x2\n" - "ldr h25, [x20], #0x2\n" + "ldr h31, [x27], #0x2\n" + "ldr h30, [x26], 
#0x2\n" "mov x19, #0x2\n" + "ldr h29, [x25], #0x2\n" + "ldr h28, [x24], #0x2\n" + "ldr h25, [x23], #0x2\n" + "ldr h23, [x22], #0x2\n" + "ldr h27, [x21], #0x2\n" + "ldr h26, [x20], #0x2\n" "tbz %x[width], #0, 7f\n" - "ld1 { v30.b }[2], [x27]\n" - "ld1 { v29.b }[2], [x26]\n" - "ld1 { v28.b }[2], [x25]\n" - "ld1 { v27.b }[2], [x24]\n" - "ld1 { v23.b }[2], [x23]\n" - "ld1 { v21.b }[2], [x22]\n" - "ld1 { v26.b }[2], [x21]\n" - "ld1 { v25.b }[2], [x20]\n" + "ld1 { v31.b }[2], [x27]\n" "mov x19, #0x3\n" + "ld1 { v30.b }[2], [x26]\n" + "ld1 { v29.b }[2], [x25]\n" + "ld1 { v28.b }[2], [x24]\n" + "ld1 { v25.b }[2], [x23]\n" + "ld1 { v23.b }[2], [x22]\n" + "ld1 { v27.b }[2], [x21]\n" + "ld1 { v26.b }[2], [x20]\n" "b 7f\n" "6:" // odd_loads_1_0 - "ldr b30, [x27, #0x0]\n" - "ldr b29, [x26, #0x0]\n" - "ldr b28, [x25, #0x0]\n" - "ldr b27, [x24, #0x0]\n" - "ldr b23, [x23, #0x0]\n" - "ldr b21, [x22, #0x0]\n" - "ldr b26, [x21, #0x0]\n" - "ldr b25, [x20, #0x0]\n" + "ldr b31, [x27, #0x0]\n" "mov x19, #0x1\n" + "ldr b30, [x26, #0x0]\n" + "ldr b29, [x25, #0x0]\n" + "ldr b28, [x24, #0x0]\n" + "ldr b25, [x23, #0x0]\n" + "ldr b23, [x22, #0x0]\n" + "ldr b27, [x21, #0x0]\n" + "ldr b26, [x20, #0x0]\n" "7:" // Odd load end + "ushll v31.8h, v31.8b, #0x0\n" + "subs x19, x19, #0x1\n" "ushll v30.8h, v30.8b, #0x0\n" "ushll v29.8h, v29.8b, #0x0\n" "ushll v28.8h, v28.8b, #0x0\n" - "ushll v27.8h, v27.8b, #0x0\n" + "ushll v25.8h, v25.8b, #0x0\n" + "zip1 v20.8h, v31.8h, v25.8h\n" "ushll v23.8h, v23.8b, #0x0\n" "zip1 v24.8h, v30.8h, v23.8h\n" - "ushll v21.8h, v21.8b, #0x0\n" + "ushll v27.8h, v27.8b, #0x0\n" + "zip1 v19.8h, v29.8h, v27.8h\n" + "zip1 v22.8h, v20.8h, v19.8h\n" "ushll v26.8h, v26.8b, #0x0\n" - "zip1 v20.8h, v28.8h, v26.8h\n" - "ushll v25.8h, v25.8b, #0x0\n" - "zip1 v22.8h, v29.8h, v21.8h\n" - "subs x19, x19, #0x1\n" - "zip1 v18.8h, v24.8h, v20.8h\n" - "zip1 v19.8h, v27.8h, v25.8h\n" - "zip1 v17.8h, v22.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" + "zip1 v18.8h, v28.8h, v26.8h\n" + "zip1 
v17.8h, v24.8h, v18.8h\n" + "zip1 v16.8h, v22.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v16.8h, v18.8h, v17.8h\n" - "subs x19, x19, #0x1\n" + "zip2 v16.8h, v22.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" + "subs x19, x19, #0x1\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v18.8h, v24.8h, v20.8h\n" - "zip2 v17.8h, v22.8h, v19.8h\n" + "zip2 v21.8h, v20.8h, v19.8h\n" + "zip2 v17.8h, v24.8h, v18.8h\n" "subs x19, x19, #0x1\n" - "zip1 v16.8h, v18.8h, v17.8h\n" + "zip1 v16.8h, v21.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v16.8h, v18.8h, v17.8h\n" - "subs x19, x19, #0x1\n" + "zip2 v16.8h, v21.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" + "subs x19, x19, #0x1\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v23.8h, v30.8h, v23.8h\n" - "zip2 v20.8h, v28.8h, v26.8h\n" + "zip2 v25.8h, v31.8h, v25.8h\n" + "zip2 v19.8h, v29.8h, v27.8h\n" "subs x19, x19, #0x1\n" - "zip1 v18.8h, v23.8h, v20.8h\n" - "zip2 v21.8h, v29.8h, v21.8h\n" - "zip2 v19.8h, v27.8h, v25.8h\n" - "zip1 v17.8h, v21.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" + "zip1 v20.8h, v25.8h, v19.8h\n" + "zip2 v23.8h, v30.8h, v23.8h\n" + "zip2 v18.8h, v28.8h, v26.8h\n" + "zip1 v17.8h, v23.8h, v18.8h\n" + "zip1 v16.8h, v20.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v16.8h, v18.8h, v17.8h\n" - "subs x19, x19, #0x1\n" + "zip2 v16.8h, v20.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" + "subs x19, x19, #0x1\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v18.8h, v23.8h, v20.8h\n" - "zip2 v17.8h, v21.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" + "zip2 v19.8h, v25.8h, v19.8h\n" + "zip2 v17.8h, v23.8h, v18.8h\n" + "zip1 v16.8h, v19.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "8:" // Odds skip - : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + 
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp index cefb70c57b..7c7d774a6b 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -96,72 +96,72 @@ void interleave_block<8, 1, VLType::None, true>( "movi v1.8h, #0x0\n" "4:" // no_accumulate_16 "ldr d30, [x27], #0x8\n" - "prfm pldl1keep, [x27, #0x70]\n" + "ushll v30.8h, v30.8b, #0x0\n" "ldr d29, [x26], #0x8\n" + "add x19, x19, #0x1\n" + "ushll v29.8h, v29.8b, #0x0\n" "ldr d28, [x25], #0x8\n" - "prfm pldl1keep, [x26, #0x70]\n" + "subs %x[width], %x[width], #0x8\n" + "ushll v28.8h, v28.8b, #0x0\n" "ldr d27, [x24], #0x8\n" + "cmp %x[width], #0x8\n" + "ushll v27.8h, v27.8b, #0x0\n" + "ldr d24, [x23], #0x8\n" + "ldr d23, [x22], #0x8\n" + "ushll v24.8h, v24.8b, #0x0\n" + "ldr d21, [x21], #0x8\n" + "ushll v23.8h, v23.8b, #0x0\n" + "ldr d26, [x20], #0x8\n" + "zip1 v20.8h, v30.8h, v24.8h\n" + "prfm pldl1keep, [x27, #0x70]\n" + "zip1 v25.8h, v29.8h, v23.8h\n" + "prfm pldl1keep, [x26, #0x70]\n" + "zip2 v24.8h, v30.8h, v24.8h\n" "prfm pldl1keep, [x25, #0x70]\n" - "ldr d23, [x23], #0x8\n" - "ldr d21, [x22], #0x8\n" + "zip2 v23.8h, v29.8h, v23.8h\n" "prfm pldl1keep, [x24, #0x70]\n" - "ldr d26, [x21], #0x8\n" - "ldr d25, [x20], #0x8\n" + "ushll v21.8h, v21.8b, #0x0\n" "prfm pldl1keep, [x23, #0x70]\n" + "zip1 v19.8h, v28.8h, v21.8h\n" "prfm pldl1keep, [x22, #0x70]\n" - "ushll v30.8h, v30.8b, #0x0\n" - "ushll v29.8h, v29.8b, #0x0\n" + "zip1 v22.8h, v20.8h, v19.8h\n" "prfm pldl1keep, [x21, #0x70]\n" - "ushll v28.8h, v28.8b, #0x0\n" + "zip2 v19.8h, v20.8h, v19.8h\n" "prfm pldl1keep, [x20, #0x70]\n" - "ushll v27.8h, v27.8b, #0x0\n" - "ushll v23.8h, v23.8b, #0x0\n" - "zip1 v24.8h, v30.8h, v23.8h\n" - "ushll v21.8h, v21.8b, #0x0\n" - "zip2 v23.8h, v30.8h, v23.8h\n" + "zip2 v20.8h, v28.8h, v21.8h\n" + "zip1 v21.8h, v24.8h, v20.8h\n" + "zip2 v20.8h, v24.8h, v20.8h\n" "ushll v26.8h, v26.8b, #0x0\n" - "ushll v25.8h, v25.8b, #0x0\n" - "zip1 v22.8h, v29.8h, v21.8h\n" - "add x19, x19, #0x1\n" - "zip2 v21.8h, v29.8h, v21.8h\n" - "subs %x[width], %x[width], #0x8\n" - "zip1 v20.8h, v28.8h, v26.8h\n" - "cmp %x[width], #0x8\n" - "zip1 
v18.8h, v24.8h, v20.8h\n" - "zip1 v19.8h, v27.8h, v25.8h\n" - "zip1 v17.8h, v22.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" + "zip1 v18.8h, v27.8h, v26.8h\n" + "zip1 v17.8h, v25.8h, v18.8h\n" + "zip1 v16.8h, v22.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add v1.8h, v1.8h, v16.8h\n" - "zip2 v16.8h, v18.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x10]\n" - "zip2 v18.8h, v24.8h, v20.8h\n" - "add v1.8h, v1.8h, v16.8h\n" - "zip2 v17.8h, v22.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x20]\n" - "add v1.8h, v1.8h, v16.8h\n" - "zip2 v16.8h, v18.8h, v17.8h\n" + "zip2 v17.8h, v22.8h, v17.8h\n" + "str q17, [%x[out_ptr], #0x10]\n" + "zip2 v16.8h, v25.8h, v18.8h\n" + "add v1.8h, v1.8h, v17.8h\n" + "zip1 v17.8h, v19.8h, v16.8h\n" + "str q17, [%x[out_ptr], #0x20]\n" + "zip2 v16.8h, v19.8h, v16.8h\n" "str q16, [%x[out_ptr], #0x30]\n" - "zip2 v20.8h, v28.8h, v26.8h\n" + "add v1.8h, v1.8h, v17.8h\n" + "zip2 v19.8h, v27.8h, v26.8h\n" + "zip1 v17.8h, v23.8h, v19.8h\n" "add v1.8h, v1.8h, v16.8h\n" - "zip1 v18.8h, v23.8h, v20.8h\n" - "zip2 v19.8h, v27.8h, v25.8h\n" - "zip1 v17.8h, v21.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" + "zip1 v16.8h, v21.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x40]\n" + "zip2 v18.8h, v21.8h, v17.8h\n" + "str q18, [%x[out_ptr], #0x50]\n" "add v1.8h, v1.8h, v16.8h\n" - "zip2 v16.8h, v18.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x50]\n" - "zip2 v18.8h, v23.8h, v20.8h\n" - "add v1.8h, v1.8h, v16.8h\n" - "zip2 v17.8h, v21.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x60]\n" - "add v1.8h, v1.8h, v16.8h\n" - "zip2 v16.8h, v18.8h, v17.8h\n" + "zip2 v16.8h, v23.8h, v19.8h\n" + "zip1 v17.8h, v20.8h, v16.8h\n" + "str q17, [%x[out_ptr], #0x60]\n" + "add v1.8h, v1.8h, v18.8h\n" + "zip2 v16.8h, v20.8h, v16.8h\n" "str q16, [%x[out_ptr], #0x70]\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" + "add v1.8h, v1.8h, v17.8h\n" "add v1.8h, v1.8h, v16.8h\n" "bge 3b\n" "5:" // Main loop skip @@ -171,148 +171,148 @@ 
void interleave_block<8, 1, VLType::None, true>( "ldr s29, [x26], #0x4\n" "ldr s28, [x25], #0x4\n" "ldr s27, [x24], #0x4\n" - "ldr s23, [x23], #0x4\n" - "ldr s21, [x22], #0x4\n" - "ldr s26, [x21], #0x4\n" - "ldr s25, [x20], #0x4\n" + "ldr s24, [x23], #0x4\n" + "ldr s23, [x22], #0x4\n" + "ldr s21, [x21], #0x4\n" + "ldr s26, [x20], #0x4\n" "tbz %x[width], #1, 6f\n" "ld1 { v30.h }[2], [x27], #0x2\n" + "mov x19, #0x6\n" "ld1 { v29.h }[2], [x26], #0x2\n" "ld1 { v28.h }[2], [x25], #0x2\n" "ld1 { v27.h }[2], [x24], #0x2\n" - "ld1 { v23.h }[2], [x23], #0x2\n" - "ld1 { v21.h }[2], [x22], #0x2\n" - "ld1 { v26.h }[2], [x21], #0x2\n" - "ld1 { v25.h }[2], [x20], #0x2\n" - "mov x19, #0x6\n" + "ld1 { v24.h }[2], [x23], #0x2\n" + "ld1 { v23.h }[2], [x22], #0x2\n" + "ld1 { v21.h }[2], [x21], #0x2\n" + "ld1 { v26.h }[2], [x20], #0x2\n" "tbz %x[width], #0, 9f\n" "ld1 { v30.b }[6], [x27]\n" + "mov x19, #0x7\n" "ld1 { v29.b }[6], [x26]\n" "ld1 { v28.b }[6], [x25]\n" "ld1 { v27.b }[6], [x24]\n" - "ld1 { v23.b }[6], [x23]\n" - "ld1 { v21.b }[6], [x22]\n" - "ld1 { v26.b }[6], [x21]\n" - "ld1 { v25.b }[6], [x20]\n" - "mov x19, #0x7\n" + "ld1 { v24.b }[6], [x23]\n" + "ld1 { v23.b }[6], [x22]\n" + "ld1 { v21.b }[6], [x21]\n" + "ld1 { v26.b }[6], [x20]\n" "b 9f\n" "6:" // odd_loads_1_4 "mov x19, #0x4\n" "tbz %x[width], #0, 9f\n" "ld1 { v30.b }[4], [x27]\n" "ld1 { v29.b }[4], [x26]\n" + "mov x19, #0x5\n" "ld1 { v28.b }[4], [x25]\n" "ld1 { v27.b }[4], [x24]\n" - "ld1 { v23.b }[4], [x23]\n" - "ld1 { v21.b }[4], [x22]\n" - "ld1 { v26.b }[4], [x21]\n" - "ld1 { v25.b }[4], [x20]\n" - "mov x19, #0x5\n" + "ld1 { v24.b }[4], [x23]\n" + "ld1 { v23.b }[4], [x22]\n" + "ld1 { v21.b }[4], [x21]\n" + "ld1 { v26.b }[4], [x20]\n" "b 9f\n" "7:" // odd_loads_2_0 "tbz %x[width], #1, 8f\n" "ldr h30, [x27], #0x2\n" "ldr h29, [x26], #0x2\n" + "mov x19, #0x2\n" "ldr h28, [x25], #0x2\n" "ldr h27, [x24], #0x2\n" - "ldr h23, [x23], #0x2\n" - "ldr h21, [x22], #0x2\n" - "ldr h26, [x21], #0x2\n" - "ldr h25, [x20], #0x2\n" 
- "mov x19, #0x2\n" + "ldr h24, [x23], #0x2\n" + "ldr h23, [x22], #0x2\n" + "ldr h21, [x21], #0x2\n" + "ldr h26, [x20], #0x2\n" "tbz %x[width], #0, 9f\n" "ld1 { v30.b }[2], [x27]\n" + "mov x19, #0x3\n" "ld1 { v29.b }[2], [x26]\n" "ld1 { v28.b }[2], [x25]\n" "ld1 { v27.b }[2], [x24]\n" - "ld1 { v23.b }[2], [x23]\n" - "ld1 { v21.b }[2], [x22]\n" - "ld1 { v26.b }[2], [x21]\n" - "ld1 { v25.b }[2], [x20]\n" - "mov x19, #0x3\n" + "ld1 { v24.b }[2], [x23]\n" + "ld1 { v23.b }[2], [x22]\n" + "ld1 { v21.b }[2], [x21]\n" + "ld1 { v26.b }[2], [x20]\n" "b 9f\n" "8:" // odd_loads_1_0 "ldr b30, [x27, #0x0]\n" + "mov x19, #0x1\n" "ldr b29, [x26, #0x0]\n" "ldr b28, [x25, #0x0]\n" "ldr b27, [x24, #0x0]\n" - "ldr b23, [x23, #0x0]\n" - "ldr b21, [x22, #0x0]\n" - "ldr b26, [x21, #0x0]\n" - "ldr b25, [x20, #0x0]\n" - "mov x19, #0x1\n" + "ldr b24, [x23, #0x0]\n" + "ldr b23, [x22, #0x0]\n" + "ldr b21, [x21, #0x0]\n" + "ldr b26, [x20, #0x0]\n" "9:" // Odd load end "ushll v30.8h, v30.8b, #0x0\n" + "subs x19, x19, #0x1\n" "ushll v29.8h, v29.8b, #0x0\n" "ushll v28.8h, v28.8b, #0x0\n" "ushll v27.8h, v27.8b, #0x0\n" + "ushll v24.8h, v24.8b, #0x0\n" + "zip1 v20.8h, v30.8h, v24.8h\n" "ushll v23.8h, v23.8b, #0x0\n" - "zip1 v24.8h, v30.8h, v23.8h\n" + "zip1 v25.8h, v29.8h, v23.8h\n" "ushll v21.8h, v21.8b, #0x0\n" + "zip1 v19.8h, v28.8h, v21.8h\n" + "zip1 v22.8h, v20.8h, v19.8h\n" "ushll v26.8h, v26.8b, #0x0\n" - "zip1 v20.8h, v28.8h, v26.8h\n" - "ushll v25.8h, v25.8b, #0x0\n" - "zip1 v22.8h, v29.8h, v21.8h\n" - "subs x19, x19, #0x1\n" - "zip1 v18.8h, v24.8h, v20.8h\n" - "zip1 v19.8h, v27.8h, v25.8h\n" - "zip1 v17.8h, v22.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" + "zip1 v18.8h, v27.8h, v26.8h\n" + "zip1 v17.8h, v25.8h, v18.8h\n" + "zip1 v16.8h, v22.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" - "add v1.8h, v1.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" + "add v1.8h, v1.8h, v16.8h\n" "beq 10f\n" - "zip2 v16.8h, v18.8h, v17.8h\n" + "zip2 v17.8h, v22.8h, v17.8h\n" + "str q17, 
[%x[out_ptr], #0x0]\n" "subs x19, x19, #0x1\n" - "add v1.8h, v1.8h, v16.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v17.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v18.8h, v24.8h, v20.8h\n" - "zip2 v17.8h, v22.8h, v19.8h\n" + "zip2 v19.8h, v20.8h, v19.8h\n" + "zip2 v16.8h, v25.8h, v18.8h\n" "subs x19, x19, #0x1\n" - "zip1 v16.8h, v18.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" - "add v1.8h, v1.8h, v16.8h\n" + "zip1 v17.8h, v19.8h, v16.8h\n" + "str q17, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" + "add v1.8h, v1.8h, v17.8h\n" "beq 10f\n" - "zip2 v16.8h, v18.8h, v17.8h\n" + "zip2 v16.8h, v19.8h, v16.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" "subs x19, x19, #0x1\n" "add v1.8h, v1.8h, v16.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v23.8h, v30.8h, v23.8h\n" - "zip2 v20.8h, v28.8h, v26.8h\n" + "zip2 v24.8h, v30.8h, v24.8h\n" + "zip2 v20.8h, v28.8h, v21.8h\n" "subs x19, x19, #0x1\n" - "zip1 v18.8h, v23.8h, v20.8h\n" - "zip2 v21.8h, v29.8h, v21.8h\n" - "zip2 v19.8h, v27.8h, v25.8h\n" - "zip1 v17.8h, v21.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" + "zip1 v21.8h, v24.8h, v20.8h\n" + "zip2 v23.8h, v29.8h, v23.8h\n" + "zip2 v19.8h, v27.8h, v26.8h\n" + "zip1 v17.8h, v23.8h, v19.8h\n" + "zip1 v16.8h, v21.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" - "add v1.8h, v1.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" + "add v1.8h, v1.8h, v16.8h\n" "beq 10f\n" - "zip2 v16.8h, v18.8h, v17.8h\n" + "zip2 v18.8h, v21.8h, v17.8h\n" + "str q18, [%x[out_ptr], #0x0]\n" "subs x19, x19, #0x1\n" - "add v1.8h, v1.8h, v16.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v18.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v18.8h, v23.8h, v20.8h\n" - "zip2 v17.8h, v21.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" - "add v1.8h, v1.8h, v16.8h\n" + "zip2 v20.8h, v24.8h, v20.8h\n" + "zip2 v16.8h, v23.8h, 
v19.8h\n" + "zip1 v17.8h, v20.8h, v16.8h\n" + "str q17, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" + "add v1.8h, v1.8h, v17.8h\n" "10:" // Odds skip "uaddw v0.4s, v0.4s, v1.4h\n" "str q0, [%x[out_ptr], #0x0]\n" "uaddw2 v31.4s, v31.4s, v1.8h\n" "str q31, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" - : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp index 5377edc1e1..1e5d395667 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -80,45 +80,45 @@ void interleave_block<8, 2, VLType::None, false>( "blt 3f\n" "2:" // Main loop head "ldr q28, [x27], #0x10\n" + "subs %x[width], %x[width], #0x8\n" + "ldr q29, [x26], #0x10\n" + "cmp %x[width], #0x8\n" + "ldr q25, [x25], #0x10\n" + "zip1 v22.4s, v28.4s, v25.4s\n" + "ldr q21, [x24], #0x10\n" + "zip2 v28.4s, v28.4s, v25.4s\n" + "ldr q27, [x23], #0x10\n" + "ldr q26, [x22], #0x10\n" + "zip1 v20.4s, v29.4s, v21.4s\n" + "ldr q19, [x21], #0x10\n" + "zip2 v25.4s, v29.4s, v21.4s\n" + "ldr q24, [x20], #0x10\n" + "zip1 v23.4s, v22.4s, v20.4s\n" "prfm pldl1keep, [x27, #0x70]\n" - "ldr q27, [x26], #0x10\n" - "ldr q26, [x25], #0x10\n" - "zip1 v23.4s, v28.4s, v26.4s\n" + "zip2 v22.4s, v22.4s, v20.4s\n" "prfm pldl1keep, [x26, #0x70]\n" - "ldr q22, [x24], #0x10\n" - "zip2 v26.4s, v28.4s, v26.4s\n" + "zip1 v21.4s, v28.4s, v25.4s\n" "prfm pldl1keep, [x25, #0x70]\n" - "ldr q25, [x23], #0x10\n" - "zip1 v20.4s, v27.4s, v22.4s\n" + "zip1 v18.4s, v27.4s, v19.4s\n" "prfm pldl1keep, [x24, #0x70]\n" - "ldr q24, [x22], #0x10\n" - "zip1 v16.4s, v23.4s, v20.4s\n" + "zip1 v16.4s, v26.4s, v24.4s\n" "prfm pldl1keep, [x23, #0x70]\n" - "ldr q19, [x21], #0x10\n" - "zip2 v23.4s, v23.4s, v20.4s\n" + "zip1 v17.4s, v18.4s, v16.4s\n" "prfm pldl1keep, [x22, #0x70]\n" - "zip2 v22.4s, v27.4s, v22.4s\n" - "ldr q21, [x20], #0x10\n" - "zip1 v18.4s, v25.4s, v19.4s\n" + "zip2 v20.4s, v18.4s, v16.4s\n" "prfm pldl1keep, [x21, #0x70]\n" - "str q16, [%x[out_ptr], #0x0]\n" - "zip1 v20.4s, v26.4s, v22.4s\n" + "zip2 v19.4s, v27.4s, v19.4s\n" "prfm pldl1keep, [x20, #0x70]\n" - "zip1 v16.4s, v24.4s, v21.4s\n" - "subs %x[width], %x[width], #0x8\n" - "zip1 v17.4s, v18.4s, v16.4s\n" - "cmp %x[width], #0x8\n" - "zip2 v16.4s, v18.4s, v16.4s\n" + "zip2 v16.4s, v26.4s, v24.4s\n" + "str q23, [%x[out_ptr], #0x0]\n" + "zip1 v18.4s, v19.4s, v16.4s\n" "str q17, [%x[out_ptr], #0x10]\n" - "zip2 v19.4s, v25.4s, v19.4s\n" - "str q23, [%x[out_ptr], #0x20]\n" - "zip2 v18.4s, v24.4s, 
v21.4s\n" - "str q16, [%x[out_ptr], #0x30]\n" - "zip1 v16.4s, v19.4s, v18.4s\n" - "str q20, [%x[out_ptr], #0x40]\n" - "zip2 v17.4s, v26.4s, v22.4s\n" - "str q16, [%x[out_ptr], #0x50]\n" - "zip2 v16.4s, v19.4s, v18.4s\n" + "zip2 v17.4s, v28.4s, v25.4s\n" + "str q22, [%x[out_ptr], #0x20]\n" + "zip2 v16.4s, v19.4s, v16.4s\n" + "str q20, [%x[out_ptr], #0x30]\n" + "str q21, [%x[out_ptr], #0x40]\n" + "str q18, [%x[out_ptr], #0x50]\n" "str q17, [%x[out_ptr], #0x60]\n" "str q16, [%x[out_ptr], #0x70]\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" @@ -127,119 +127,119 @@ void interleave_block<8, 2, VLType::None, false>( "cbz %x[width], 8f\n" "tbz %x[width], #2, 5f\n" "ldr d28, [x27], #0x8\n" - "ldr d27, [x26], #0x8\n" - "ldr d26, [x25], #0x8\n" - "ldr d22, [x24], #0x8\n" - "ldr d25, [x23], #0x8\n" - "ldr d24, [x22], #0x8\n" + "ldr d29, [x26], #0x8\n" + "ldr d25, [x25], #0x8\n" + "ldr d21, [x24], #0x8\n" + "ldr d27, [x23], #0x8\n" + "ldr d26, [x22], #0x8\n" "ldr d19, [x21], #0x8\n" - "ldr d21, [x20], #0x8\n" + "ldr d24, [x20], #0x8\n" "tbz %x[width], #1, 4f\n" "ld1 { v28.s }[2], [x27], #0x4\n" - "ld1 { v27.s }[2], [x26], #0x4\n" - "ld1 { v26.s }[2], [x25], #0x4\n" - "ld1 { v22.s }[2], [x24], #0x4\n" - "ld1 { v25.s }[2], [x23], #0x4\n" - "ld1 { v24.s }[2], [x22], #0x4\n" - "ld1 { v19.s }[2], [x21], #0x4\n" - "ld1 { v21.s }[2], [x20], #0x4\n" "mov x19, #0x3\n" + "ld1 { v29.s }[2], [x26], #0x4\n" + "ld1 { v25.s }[2], [x25], #0x4\n" + "ld1 { v21.s }[2], [x24], #0x4\n" + "ld1 { v27.s }[2], [x23], #0x4\n" + "ld1 { v26.s }[2], [x22], #0x4\n" + "ld1 { v19.s }[2], [x21], #0x4\n" + "ld1 { v24.s }[2], [x20], #0x4\n" "tbz %x[width], #0, 7f\n" "ld1 { v28.h }[6], [x27]\n" - "ld1 { v27.h }[6], [x26]\n" - "ld1 { v26.h }[6], [x25]\n" - "ld1 { v22.h }[6], [x24]\n" - "ld1 { v25.h }[6], [x23]\n" - "ld1 { v24.h }[6], [x22]\n" - "ld1 { v19.h }[6], [x21]\n" - "ld1 { v21.h }[6], [x20]\n" "mov x19, #0x4\n" + "ld1 { v29.h }[6], [x26]\n" + "ld1 { v25.h }[6], [x25]\n" + "ld1 { v21.h }[6], [x24]\n" + "ld1 { 
v27.h }[6], [x23]\n" + "ld1 { v26.h }[6], [x22]\n" + "ld1 { v19.h }[6], [x21]\n" + "ld1 { v24.h }[6], [x20]\n" "b 7f\n" "4:" // odd_loads_1_4 "mov x19, #0x2\n" "tbz %x[width], #0, 7f\n" "ld1 { v28.h }[4], [x27]\n" - "ld1 { v27.h }[4], [x26]\n" - "ld1 { v26.h }[4], [x25]\n" - "ld1 { v22.h }[4], [x24]\n" - "ld1 { v25.h }[4], [x23]\n" - "ld1 { v24.h }[4], [x22]\n" - "ld1 { v19.h }[4], [x21]\n" - "ld1 { v21.h }[4], [x20]\n" + "ld1 { v29.h }[4], [x26]\n" "mov x19, #0x3\n" + "ld1 { v25.h }[4], [x25]\n" + "ld1 { v21.h }[4], [x24]\n" + "ld1 { v27.h }[4], [x23]\n" + "ld1 { v26.h }[4], [x22]\n" + "ld1 { v19.h }[4], [x21]\n" + "ld1 { v24.h }[4], [x20]\n" "b 7f\n" "5:" // odd_loads_2_0 "tbz %x[width], #1, 6f\n" "ldr s28, [x27], #0x4\n" - "ldr s27, [x26], #0x4\n" - "ldr s26, [x25], #0x4\n" - "ldr s22, [x24], #0x4\n" - "ldr s25, [x23], #0x4\n" - "ldr s24, [x22], #0x4\n" - "ldr s19, [x21], #0x4\n" - "ldr s21, [x20], #0x4\n" + "ldr s29, [x26], #0x4\n" "mov x19, #0x1\n" + "ldr s25, [x25], #0x4\n" + "ldr s21, [x24], #0x4\n" + "ldr s27, [x23], #0x4\n" + "ldr s26, [x22], #0x4\n" + "ldr s19, [x21], #0x4\n" + "ldr s24, [x20], #0x4\n" "tbz %x[width], #0, 7f\n" "ld1 { v28.h }[2], [x27]\n" - "ld1 { v27.h }[2], [x26]\n" - "ld1 { v26.h }[2], [x25]\n" - "ld1 { v22.h }[2], [x24]\n" - "ld1 { v25.h }[2], [x23]\n" - "ld1 { v24.h }[2], [x22]\n" - "ld1 { v19.h }[2], [x21]\n" - "ld1 { v21.h }[2], [x20]\n" "mov x19, #0x2\n" + "ld1 { v29.h }[2], [x26]\n" + "ld1 { v25.h }[2], [x25]\n" + "ld1 { v21.h }[2], [x24]\n" + "ld1 { v27.h }[2], [x23]\n" + "ld1 { v26.h }[2], [x22]\n" + "ld1 { v19.h }[2], [x21]\n" + "ld1 { v24.h }[2], [x20]\n" "b 7f\n" "6:" // odd_loads_1_0 "ldr h28, [x27, #0x0]\n" - "ldr h27, [x26, #0x0]\n" - "ldr h26, [x25, #0x0]\n" - "ldr h22, [x24, #0x0]\n" - "ldr h25, [x23, #0x0]\n" - "ldr h24, [x22, #0x0]\n" - "ldr h19, [x21, #0x0]\n" - "ldr h21, [x20, #0x0]\n" "mov x19, #0x1\n" + "ldr h29, [x26, #0x0]\n" + "ldr h25, [x25, #0x0]\n" + "ldr h21, [x24, #0x0]\n" + "ldr h27, [x23, #0x0]\n" + "ldr 
h26, [x22, #0x0]\n" + "ldr h19, [x21, #0x0]\n" + "ldr h24, [x20, #0x0]\n" "7:" // Odd load end - "zip1 v23.4s, v28.4s, v26.4s\n" + "zip1 v22.4s, v28.4s, v25.4s\n" "subs x19, x19, #0x1\n" - "zip1 v20.4s, v27.4s, v22.4s\n" - "zip1 v16.4s, v23.4s, v20.4s\n" - "str q16, [%x[out_ptr], #0x0]\n" - "zip1 v18.4s, v25.4s, v19.4s\n" - "zip1 v16.4s, v24.4s, v21.4s\n" + "zip1 v20.4s, v29.4s, v21.4s\n" + "zip1 v23.4s, v22.4s, v20.4s\n" + "str q23, [%x[out_ptr], #0x0]\n" + "zip1 v18.4s, v27.4s, v19.4s\n" + "zip1 v16.4s, v26.4s, v24.4s\n" "zip1 v17.4s, v18.4s, v16.4s\n" "str q17, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 8f\n" - "zip2 v23.4s, v23.4s, v20.4s\n" - "zip2 v16.4s, v18.4s, v16.4s\n" - "str q23, [%x[out_ptr], #0x0]\n" - "str q16, [%x[out_ptr], #0x10]\n" + "zip2 v22.4s, v22.4s, v20.4s\n" + "str q22, [%x[out_ptr], #0x0]\n" + "zip2 v20.4s, v18.4s, v16.4s\n" "subs x19, x19, #0x1\n" + "str q20, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 8f\n" - "zip2 v26.4s, v28.4s, v26.4s\n" - "zip2 v22.4s, v27.4s, v22.4s\n" + "zip2 v28.4s, v28.4s, v25.4s\n" + "zip2 v25.4s, v29.4s, v21.4s\n" "subs x19, x19, #0x1\n" - "zip1 v20.4s, v26.4s, v22.4s\n" - "str q20, [%x[out_ptr], #0x0]\n" - "zip2 v19.4s, v25.4s, v19.4s\n" - "zip2 v18.4s, v24.4s, v21.4s\n" - "zip1 v16.4s, v19.4s, v18.4s\n" - "str q16, [%x[out_ptr], #0x10]\n" + "zip1 v21.4s, v28.4s, v25.4s\n" + "str q21, [%x[out_ptr], #0x0]\n" + "zip2 v19.4s, v27.4s, v19.4s\n" + "zip2 v16.4s, v26.4s, v24.4s\n" + "zip1 v18.4s, v19.4s, v16.4s\n" + "str q18, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 8f\n" - "zip2 v17.4s, v26.4s, v22.4s\n" - "zip2 v16.4s, v19.4s, v18.4s\n" + "zip2 v17.4s, v28.4s, v25.4s\n" "str q17, [%x[out_ptr], #0x0]\n" + "zip2 v16.4s, v19.4s, v16.4s\n" "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "8:" // Odds skip - : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] 
"r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp index 3aea6a8999..064207c0fa 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -80,39 +80,39 @@ void interleave_block<8, 2, VLType::None, false>( "blt 3f\n" "2:" // Main loop head "ldr q27, [x27], #0x10\n" - "prfm pldl1keep, [x27, #0x70]\n" + "subs %x[width], %x[width], #0x4\n" "ldr q24, [x26], #0x10\n" "zip1 v26.2d, v27.2d, v24.2d\n" - "prfm pldl1keep, [x26, #0x70]\n" "ldr q25, [x25], #0x10\n" + "cmp %x[width], #0x4\n" "zip2 v24.2d, v27.2d, v24.2d\n" - "prfm pldl1keep, [x25, #0x70]\n" "ldr q21, [x24], #0x10\n" - "zip1 v23.2d, v25.2d, v21.2d\n" - "prfm pldl1keep, [x24, #0x70]\n" - "ldr q22, [x23], #0x10\n" + "ldr q23, [x23], #0x10\n" + "zip1 v22.2d, v25.2d, v21.2d\n" + "ldr q18, [x22], #0x10\n" "zip2 v21.2d, v25.2d, v21.2d\n" + "ldr q20, [x21], #0x10\n" + "ldr q16, [x20], #0x10\n" + "zip1 v19.2d, v23.2d, v18.2d\n" + "prfm pldl1keep, [x27, #0x70]\n" + "zip2 v18.2d, v23.2d, v18.2d\n" + "prfm pldl1keep, [x26, #0x70]\n" + "zip1 v17.2d, v20.2d, v16.2d\n" + "prfm pldl1keep, [x25, #0x70]\n" + "zip2 v16.2d, v20.2d, v16.2d\n" + "prfm pldl1keep, [x24, #0x70]\n" "prfm pldl1keep, [x23, #0x70]\n" - 
"ldr q18, [x22], #0x10\n" - "zip1 v20.2d, v22.2d, v18.2d\n" "prfm pldl1keep, [x22, #0x70]\n" - "ldr q19, [x21], #0x10\n" - "zip2 v18.2d, v22.2d, v18.2d\n" "prfm pldl1keep, [x21, #0x70]\n" - "ldr q16, [x20], #0x10\n" - "zip1 v17.2d, v19.2d, v16.2d\n" "prfm pldl1keep, [x20, #0x70]\n" "str q26, [%x[out_ptr], #0x0]\n" - "zip2 v16.2d, v19.2d, v16.2d\n" - "str q23, [%x[out_ptr], #0x10]\n" - "str q20, [%x[out_ptr], #0x20]\n" + "str q22, [%x[out_ptr], #0x10]\n" + "str q19, [%x[out_ptr], #0x20]\n" "str q17, [%x[out_ptr], #0x30]\n" "str q24, [%x[out_ptr], #0x40]\n" "str q21, [%x[out_ptr], #0x50]\n" "str q18, [%x[out_ptr], #0x60]\n" "str q16, [%x[out_ptr], #0x70]\n" - "subs %x[width], %x[width], #0x4\n" - "cmp %x[width], #0x4\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" "bge 2b\n" "3:" // Main loop skip @@ -120,58 +120,58 @@ void interleave_block<8, 2, VLType::None, false>( "tbz %x[width], #1, 4f\n" "ldr d27, [x27], #0x8\n" "ldr d24, [x26], #0x8\n" + "mov x19, #0x1\n" "ldr d25, [x25], #0x8\n" "ldr d21, [x24], #0x8\n" - "ldr d22, [x23], #0x8\n" + "ldr d23, [x23], #0x8\n" "ldr d18, [x22], #0x8\n" - "ldr d19, [x21], #0x8\n" + "ldr d20, [x21], #0x8\n" "ldr d16, [x20], #0x8\n" - "mov x19, #0x1\n" "tbz %x[width], #0, 5f\n" "ld1 { v27.s }[2], [x27]\n" + "mov x19, #0x2\n" "ld1 { v24.s }[2], [x26]\n" "ld1 { v25.s }[2], [x25]\n" "ld1 { v21.s }[2], [x24]\n" - "ld1 { v22.s }[2], [x23]\n" + "ld1 { v23.s }[2], [x23]\n" "ld1 { v18.s }[2], [x22]\n" - "ld1 { v19.s }[2], [x21]\n" + "ld1 { v20.s }[2], [x21]\n" "ld1 { v16.s }[2], [x20]\n" - "mov x19, #0x2\n" "b 5f\n" "4:" // odd_loads_1_0 "ldr s27, [x27, #0x0]\n" + "mov x19, #0x1\n" "ldr s24, [x26, #0x0]\n" "ldr s25, [x25, #0x0]\n" "ldr s21, [x24, #0x0]\n" - "ldr s22, [x23, #0x0]\n" + "ldr s23, [x23, #0x0]\n" "ldr s18, [x22, #0x0]\n" - "ldr s19, [x21, #0x0]\n" + "ldr s20, [x21, #0x0]\n" "ldr s16, [x20, #0x0]\n" - "mov x19, #0x1\n" "5:" // Odd load end "zip1 v26.2d, v27.2d, v24.2d\n" - "subs x19, x19, #0x1\n" - "zip1 v23.2d, v25.2d, v21.2d\n" "str 
q26, [%x[out_ptr], #0x0]\n" - "zip1 v20.2d, v22.2d, v18.2d\n" - "str q23, [%x[out_ptr], #0x10]\n" - "zip1 v17.2d, v19.2d, v16.2d\n" - "str q20, [%x[out_ptr], #0x20]\n" + "zip1 v22.2d, v25.2d, v21.2d\n" + "subs x19, x19, #0x1\n" + "zip1 v19.2d, v23.2d, v18.2d\n" + "str q22, [%x[out_ptr], #0x10]\n" + "zip1 v17.2d, v20.2d, v16.2d\n" + "str q19, [%x[out_ptr], #0x20]\n" "str q17, [%x[out_ptr], #0x30]\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "beq 6f\n" "zip2 v24.2d, v27.2d, v24.2d\n" - "zip2 v21.2d, v25.2d, v21.2d\n" "str q24, [%x[out_ptr], #0x0]\n" - "zip2 v18.2d, v22.2d, v18.2d\n" + "zip2 v21.2d, v25.2d, v21.2d\n" + "zip2 v18.2d, v23.2d, v18.2d\n" "str q21, [%x[out_ptr], #0x10]\n" - "zip2 v16.2d, v19.2d, v16.2d\n" + "zip2 v16.2d, v20.2d, v16.2d\n" "str q18, [%x[out_ptr], #0x20]\n" "str q16, [%x[out_ptr], #0x30]\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "6:" // Odds skip - : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp index 4780b77a4a..1f86722bc1 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -80,39 +80,39 @@ void interleave_block<8, 4, VLType::None, false>( "blt 3f\n" "2:" // Main loop head "ldr q27, [x27], #0x10\n" - "prfm pldl1keep, [x27, #0x70]\n" + "subs %x[width], %x[width], #0x8\n" "ldr q24, [x26], #0x10\n" "zip1 v26.2d, v27.2d, v24.2d\n" - "prfm pldl1keep, [x26, #0x70]\n" "ldr q25, [x25], #0x10\n" + "cmp %x[width], #0x8\n" "zip2 v24.2d, v27.2d, v24.2d\n" - "prfm pldl1keep, [x25, #0x70]\n" "ldr q21, [x24], #0x10\n" - "zip1 v23.2d, v25.2d, v21.2d\n" - "prfm pldl1keep, [x24, #0x70]\n" - "ldr q22, [x23], #0x10\n" + "ldr q23, [x23], #0x10\n" + "zip1 v22.2d, v25.2d, v21.2d\n" + "ldr q18, [x22], #0x10\n" "zip2 v21.2d, v25.2d, v21.2d\n" + "ldr q20, [x21], #0x10\n" + "ldr q16, [x20], #0x10\n" + "zip1 v19.2d, v23.2d, v18.2d\n" + "prfm pldl1keep, [x27, #0x70]\n" + "zip2 v18.2d, v23.2d, v18.2d\n" + "prfm pldl1keep, [x26, #0x70]\n" + "zip1 v17.2d, v20.2d, v16.2d\n" + "prfm pldl1keep, [x25, #0x70]\n" + "zip2 v16.2d, v20.2d, v16.2d\n" + "prfm pldl1keep, [x24, #0x70]\n" "prfm pldl1keep, [x23, #0x70]\n" - "ldr q18, [x22], #0x10\n" - "zip1 v20.2d, v22.2d, v18.2d\n" "prfm pldl1keep, [x22, #0x70]\n" - "ldr q19, [x21], #0x10\n" - "zip2 v18.2d, v22.2d, v18.2d\n" "prfm pldl1keep, [x21, #0x70]\n" - "ldr q16, [x20], #0x10\n" - "zip1 v17.2d, v19.2d, v16.2d\n" "prfm pldl1keep, [x20, #0x70]\n" "str q26, [%x[out_ptr], #0x0]\n" - "zip2 v16.2d, v19.2d, v16.2d\n" - "str q23, [%x[out_ptr], #0x10]\n" - "str q20, [%x[out_ptr], #0x20]\n" + "str q22, [%x[out_ptr], #0x10]\n" + "str q19, [%x[out_ptr], #0x20]\n" "str q17, [%x[out_ptr], #0x30]\n" "str q24, [%x[out_ptr], #0x40]\n" "str q21, [%x[out_ptr], #0x50]\n" "str q18, [%x[out_ptr], #0x60]\n" "str q16, [%x[out_ptr], #0x70]\n" - "subs %x[width], %x[width], #0x8\n" - "cmp %x[width], #0x8\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" "bge 2b\n" "3:" // Main loop skip @@ -122,28 +122,28 @@ void interleave_block<8, 4, VLType::None, false>( "ldr d24, [x26], #0x8\n" "ldr d25, [x25], #0x8\n" "ldr d21, 
[x24], #0x8\n" - "ldr d22, [x23], #0x8\n" + "ldr d23, [x23], #0x8\n" "ldr d18, [x22], #0x8\n" - "ldr d19, [x21], #0x8\n" + "ldr d20, [x21], #0x8\n" "ldr d16, [x20], #0x8\n" "tbz %x[width], #1, 4f\n" "ld1 { v27.s }[2], [x27], #0x4\n" + "mov x19, #0x2\n" "ld1 { v24.s }[2], [x26], #0x4\n" "ld1 { v25.s }[2], [x25], #0x4\n" "ld1 { v21.s }[2], [x24], #0x4\n" - "ld1 { v22.s }[2], [x23], #0x4\n" + "ld1 { v23.s }[2], [x23], #0x4\n" "ld1 { v18.s }[2], [x22], #0x4\n" - "ld1 { v19.s }[2], [x21], #0x4\n" + "ld1 { v20.s }[2], [x21], #0x4\n" "ld1 { v16.s }[2], [x20], #0x4\n" - "mov x19, #0x2\n" "tbz %x[width], #0, 7f\n" "ld1 { v27.h }[6], [x27]\n" "ld1 { v24.h }[6], [x26]\n" "ld1 { v25.h }[6], [x25]\n" "ld1 { v21.h }[6], [x24]\n" - "ld1 { v22.h }[6], [x23]\n" + "ld1 { v23.h }[6], [x23]\n" "ld1 { v18.h }[6], [x22]\n" - "ld1 { v19.h }[6], [x21]\n" + "ld1 { v20.h }[6], [x21]\n" "ld1 { v16.h }[6], [x20]\n" "b 7f\n" "4:" // odd_loads_1_4 @@ -151,69 +151,69 @@ void interleave_block<8, 4, VLType::None, false>( "tbz %x[width], #0, 7f\n" "ld1 { v27.h }[4], [x27]\n" "ld1 { v24.h }[4], [x26]\n" + "mov x19, #0x2\n" "ld1 { v25.h }[4], [x25]\n" "ld1 { v21.h }[4], [x24]\n" - "ld1 { v22.h }[4], [x23]\n" + "ld1 { v23.h }[4], [x23]\n" "ld1 { v18.h }[4], [x22]\n" - "ld1 { v19.h }[4], [x21]\n" + "ld1 { v20.h }[4], [x21]\n" "ld1 { v16.h }[4], [x20]\n" - "mov x19, #0x2\n" "b 7f\n" "5:" // odd_loads_2_0 "tbz %x[width], #1, 6f\n" "ldr s27, [x27], #0x4\n" "ldr s24, [x26], #0x4\n" + "mov x19, #0x1\n" "ldr s25, [x25], #0x4\n" "ldr s21, [x24], #0x4\n" - "ldr s22, [x23], #0x4\n" + "ldr s23, [x23], #0x4\n" "ldr s18, [x22], #0x4\n" - "ldr s19, [x21], #0x4\n" + "ldr s20, [x21], #0x4\n" "ldr s16, [x20], #0x4\n" - "mov x19, #0x1\n" "tbz %x[width], #0, 7f\n" "ld1 { v27.h }[2], [x27]\n" "ld1 { v24.h }[2], [x26]\n" "ld1 { v25.h }[2], [x25]\n" "ld1 { v21.h }[2], [x24]\n" - "ld1 { v22.h }[2], [x23]\n" + "ld1 { v23.h }[2], [x23]\n" "ld1 { v18.h }[2], [x22]\n" - "ld1 { v19.h }[2], [x21]\n" + "ld1 { v20.h }[2], [x21]\n" 
"ld1 { v16.h }[2], [x20]\n" "b 7f\n" "6:" // odd_loads_1_0 "ldr h27, [x27, #0x0]\n" + "mov x19, #0x1\n" "ldr h24, [x26, #0x0]\n" "ldr h25, [x25, #0x0]\n" "ldr h21, [x24, #0x0]\n" - "ldr h22, [x23, #0x0]\n" + "ldr h23, [x23, #0x0]\n" "ldr h18, [x22, #0x0]\n" - "ldr h19, [x21, #0x0]\n" + "ldr h20, [x21, #0x0]\n" "ldr h16, [x20, #0x0]\n" - "mov x19, #0x1\n" "7:" // Odd load end "zip1 v26.2d, v27.2d, v24.2d\n" - "subs x19, x19, #0x1\n" - "zip1 v23.2d, v25.2d, v21.2d\n" "str q26, [%x[out_ptr], #0x0]\n" - "zip1 v20.2d, v22.2d, v18.2d\n" - "str q23, [%x[out_ptr], #0x10]\n" - "zip1 v17.2d, v19.2d, v16.2d\n" - "str q20, [%x[out_ptr], #0x20]\n" + "zip1 v22.2d, v25.2d, v21.2d\n" + "subs x19, x19, #0x1\n" + "zip1 v19.2d, v23.2d, v18.2d\n" + "str q22, [%x[out_ptr], #0x10]\n" + "zip1 v17.2d, v20.2d, v16.2d\n" + "str q19, [%x[out_ptr], #0x20]\n" "str q17, [%x[out_ptr], #0x30]\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "beq 8f\n" "zip2 v24.2d, v27.2d, v24.2d\n" - "zip2 v21.2d, v25.2d, v21.2d\n" "str q24, [%x[out_ptr], #0x0]\n" - "zip2 v18.2d, v22.2d, v18.2d\n" + "zip2 v21.2d, v25.2d, v21.2d\n" + "zip2 v18.2d, v23.2d, v18.2d\n" "str q21, [%x[out_ptr], #0x10]\n" - "zip2 v16.2d, v19.2d, v16.2d\n" + "zip2 v16.2d, v20.2d, v16.2d\n" "str q18, [%x[out_ptr], #0x20]\n" "str q16, [%x[out_ptr], #0x30]\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "8:" // Odds skip - : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp index a9034f5742..659d9947e2 100644 --- 
a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -80,45 +80,45 @@ void interleave_block<8, 4, VLType::None, false>( "blt 3f\n" "2:" // Main loop head "ldr q28, [x27], #0x10\n" + "subs %x[width], %x[width], #0x10\n" + "ldr q29, [x26], #0x10\n" + "cmp %x[width], #0x10\n" + "ldr q25, [x25], #0x10\n" + "zip1 v22.4s, v28.4s, v25.4s\n" + "ldr q21, [x24], #0x10\n" + "zip2 v28.4s, v28.4s, v25.4s\n" + "ldr q27, [x23], #0x10\n" + "ldr q26, [x22], #0x10\n" + "zip1 v20.4s, v29.4s, v21.4s\n" + "ldr q19, [x21], #0x10\n" + "zip2 v25.4s, v29.4s, v21.4s\n" + "ldr q24, [x20], #0x10\n" + "zip1 v23.4s, v22.4s, v20.4s\n" "prfm pldl1keep, [x27, #0x70]\n" - "ldr q27, [x26], #0x10\n" - "ldr q26, [x25], #0x10\n" - "zip1 v23.4s, v28.4s, v26.4s\n" + "zip2 v22.4s, v22.4s, v20.4s\n" "prfm pldl1keep, [x26, #0x70]\n" - "ldr q22, [x24], #0x10\n" - "zip2 v26.4s, v28.4s, v26.4s\n" + "zip1 v21.4s, v28.4s, v25.4s\n" "prfm pldl1keep, [x25, #0x70]\n" - "ldr q25, [x23], #0x10\n" - "zip1 v20.4s, v27.4s, v22.4s\n" + "zip1 v18.4s, v27.4s, v19.4s\n" "prfm pldl1keep, [x24, #0x70]\n" - "ldr q24, [x22], #0x10\n" - "zip1 v16.4s, v23.4s, v20.4s\n" + "zip1 v16.4s, v26.4s, v24.4s\n" "prfm pldl1keep, [x23, #0x70]\n" - "ldr q19, [x21], #0x10\n" - "zip2 v23.4s, v23.4s, v20.4s\n" + "zip1 v17.4s, v18.4s, v16.4s\n" "prfm pldl1keep, [x22, #0x70]\n" - "zip2 v22.4s, v27.4s, v22.4s\n" - "ldr q21, [x20], #0x10\n" - "zip1 v18.4s, v25.4s, v19.4s\n" + "zip2 v20.4s, v18.4s, v16.4s\n" "prfm pldl1keep, [x21, #0x70]\n" - "str q16, [%x[out_ptr], #0x0]\n" - "zip1 v20.4s, v26.4s, v22.4s\n" + "zip2 v19.4s, v27.4s, v19.4s\n" "prfm pldl1keep, [x20, #0x70]\n" - "zip1 v16.4s, v24.4s, v21.4s\n" - "subs %x[width], %x[width], #0x10\n" - "zip1 v17.4s, v18.4s, v16.4s\n" - "cmp 
%x[width], #0x10\n" - "zip2 v16.4s, v18.4s, v16.4s\n" + "zip2 v16.4s, v26.4s, v24.4s\n" + "str q23, [%x[out_ptr], #0x0]\n" + "zip1 v18.4s, v19.4s, v16.4s\n" "str q17, [%x[out_ptr], #0x10]\n" - "zip2 v19.4s, v25.4s, v19.4s\n" - "str q23, [%x[out_ptr], #0x20]\n" - "zip2 v18.4s, v24.4s, v21.4s\n" - "str q16, [%x[out_ptr], #0x30]\n" - "zip1 v16.4s, v19.4s, v18.4s\n" - "str q20, [%x[out_ptr], #0x40]\n" - "zip2 v17.4s, v26.4s, v22.4s\n" - "str q16, [%x[out_ptr], #0x50]\n" - "zip2 v16.4s, v19.4s, v18.4s\n" + "zip2 v17.4s, v28.4s, v25.4s\n" + "str q22, [%x[out_ptr], #0x20]\n" + "zip2 v16.4s, v19.4s, v16.4s\n" + "str q20, [%x[out_ptr], #0x30]\n" + "str q21, [%x[out_ptr], #0x40]\n" + "str q18, [%x[out_ptr], #0x50]\n" "str q17, [%x[out_ptr], #0x60]\n" "str q16, [%x[out_ptr], #0x70]\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" @@ -127,203 +127,203 @@ void interleave_block<8, 4, VLType::None, false>( "cbz %x[width], 12f\n" "tbz %x[width], #3, 7f\n" "ldr d28, [x27], #0x8\n" - "ldr d27, [x26], #0x8\n" - "ldr d26, [x25], #0x8\n" - "ldr d22, [x24], #0x8\n" - "ldr d25, [x23], #0x8\n" - "ldr d24, [x22], #0x8\n" + "ldr d29, [x26], #0x8\n" + "ldr d25, [x25], #0x8\n" + "ldr d21, [x24], #0x8\n" + "ldr d27, [x23], #0x8\n" + "ldr d26, [x22], #0x8\n" "ldr d19, [x21], #0x8\n" - "ldr d21, [x20], #0x8\n" + "ldr d24, [x20], #0x8\n" "tbz %x[width], #2, 5f\n" "ld1 { v28.s }[2], [x27], #0x4\n" - "ld1 { v27.s }[2], [x26], #0x4\n" - "ld1 { v26.s }[2], [x25], #0x4\n" - "ld1 { v22.s }[2], [x24], #0x4\n" - "ld1 { v25.s }[2], [x23], #0x4\n" - "ld1 { v24.s }[2], [x22], #0x4\n" + "ld1 { v29.s }[2], [x26], #0x4\n" + "ld1 { v25.s }[2], [x25], #0x4\n" + "ld1 { v21.s }[2], [x24], #0x4\n" + "ld1 { v27.s }[2], [x23], #0x4\n" + "ld1 { v26.s }[2], [x22], #0x4\n" "ld1 { v19.s }[2], [x21], #0x4\n" - "ld1 { v21.s }[2], [x20], #0x4\n" + "ld1 { v24.s }[2], [x20], #0x4\n" "tbz %x[width], #1, 4f\n" "ld1 { v28.h }[6], [x27], #0x2\n" - "ld1 { v27.h }[6], [x26], #0x2\n" - "ld1 { v26.h }[6], [x25], #0x2\n" - "ld1 { v22.h 
}[6], [x24], #0x2\n" - "ld1 { v25.h }[6], [x23], #0x2\n" - "ld1 { v24.h }[6], [x22], #0x2\n" - "ld1 { v19.h }[6], [x21], #0x2\n" - "ld1 { v21.h }[6], [x20], #0x2\n" "mov x19, #0x4\n" + "ld1 { v29.h }[6], [x26], #0x2\n" + "ld1 { v25.h }[6], [x25], #0x2\n" + "ld1 { v21.h }[6], [x24], #0x2\n" + "ld1 { v27.h }[6], [x23], #0x2\n" + "ld1 { v26.h }[6], [x22], #0x2\n" + "ld1 { v19.h }[6], [x21], #0x2\n" + "ld1 { v24.h }[6], [x20], #0x2\n" "tbz %x[width], #0, 11f\n" "ld1 { v28.b }[14], [x27]\n" - "ld1 { v27.b }[14], [x26]\n" - "ld1 { v26.b }[14], [x25]\n" - "ld1 { v22.b }[14], [x24]\n" - "ld1 { v25.b }[14], [x23]\n" - "ld1 { v24.b }[14], [x22]\n" + "ld1 { v29.b }[14], [x26]\n" + "ld1 { v25.b }[14], [x25]\n" + "ld1 { v21.b }[14], [x24]\n" + "ld1 { v27.b }[14], [x23]\n" + "ld1 { v26.b }[14], [x22]\n" "ld1 { v19.b }[14], [x21]\n" - "ld1 { v21.b }[14], [x20]\n" + "ld1 { v24.b }[14], [x20]\n" "b 11f\n" "4:" // odd_loads_1_12 "mov x19, #0x3\n" "tbz %x[width], #0, 11f\n" "ld1 { v28.b }[12], [x27]\n" - "ld1 { v27.b }[12], [x26]\n" - "ld1 { v26.b }[12], [x25]\n" - "ld1 { v22.b }[12], [x24]\n" - "ld1 { v25.b }[12], [x23]\n" - "ld1 { v24.b }[12], [x22]\n" - "ld1 { v19.b }[12], [x21]\n" - "ld1 { v21.b }[12], [x20]\n" + "ld1 { v29.b }[12], [x26]\n" "mov x19, #0x4\n" + "ld1 { v25.b }[12], [x25]\n" + "ld1 { v21.b }[12], [x24]\n" + "ld1 { v27.b }[12], [x23]\n" + "ld1 { v26.b }[12], [x22]\n" + "ld1 { v19.b }[12], [x21]\n" + "ld1 { v24.b }[12], [x20]\n" "b 11f\n" "5:" // odd_loads_2_8 "tbz %x[width], #1, 6f\n" "ld1 { v28.h }[4], [x27], #0x2\n" - "ld1 { v27.h }[4], [x26], #0x2\n" - "ld1 { v26.h }[4], [x25], #0x2\n" - "ld1 { v22.h }[4], [x24], #0x2\n" - "ld1 { v25.h }[4], [x23], #0x2\n" - "ld1 { v24.h }[4], [x22], #0x2\n" - "ld1 { v19.h }[4], [x21], #0x2\n" - "ld1 { v21.h }[4], [x20], #0x2\n" + "ld1 { v29.h }[4], [x26], #0x2\n" "mov x19, #0x3\n" + "ld1 { v25.h }[4], [x25], #0x2\n" + "ld1 { v21.h }[4], [x24], #0x2\n" + "ld1 { v27.h }[4], [x23], #0x2\n" + "ld1 { v26.h }[4], [x22], #0x2\n" + "ld1 
{ v19.h }[4], [x21], #0x2\n" + "ld1 { v24.h }[4], [x20], #0x2\n" "tbz %x[width], #0, 11f\n" "ld1 { v28.b }[10], [x27]\n" - "ld1 { v27.b }[10], [x26]\n" - "ld1 { v26.b }[10], [x25]\n" - "ld1 { v22.b }[10], [x24]\n" - "ld1 { v25.b }[10], [x23]\n" - "ld1 { v24.b }[10], [x22]\n" + "ld1 { v29.b }[10], [x26]\n" + "ld1 { v25.b }[10], [x25]\n" + "ld1 { v21.b }[10], [x24]\n" + "ld1 { v27.b }[10], [x23]\n" + "ld1 { v26.b }[10], [x22]\n" "ld1 { v19.b }[10], [x21]\n" - "ld1 { v21.b }[10], [x20]\n" + "ld1 { v24.b }[10], [x20]\n" "b 11f\n" "6:" // odd_loads_1_8 "mov x19, #0x2\n" "tbz %x[width], #0, 11f\n" "ld1 { v28.b }[8], [x27]\n" - "ld1 { v27.b }[8], [x26]\n" - "ld1 { v26.b }[8], [x25]\n" - "ld1 { v22.b }[8], [x24]\n" - "ld1 { v25.b }[8], [x23]\n" - "ld1 { v24.b }[8], [x22]\n" - "ld1 { v19.b }[8], [x21]\n" - "ld1 { v21.b }[8], [x20]\n" + "ld1 { v29.b }[8], [x26]\n" "mov x19, #0x3\n" + "ld1 { v25.b }[8], [x25]\n" + "ld1 { v21.b }[8], [x24]\n" + "ld1 { v27.b }[8], [x23]\n" + "ld1 { v26.b }[8], [x22]\n" + "ld1 { v19.b }[8], [x21]\n" + "ld1 { v24.b }[8], [x20]\n" "b 11f\n" "7:" // odd_loads_4_0 "tbz %x[width], #2, 9f\n" "ldr s28, [x27], #0x4\n" - "ldr s27, [x26], #0x4\n" - "ldr s26, [x25], #0x4\n" - "ldr s22, [x24], #0x4\n" - "ldr s25, [x23], #0x4\n" - "ldr s24, [x22], #0x4\n" + "ldr s29, [x26], #0x4\n" + "ldr s25, [x25], #0x4\n" + "ldr s21, [x24], #0x4\n" + "ldr s27, [x23], #0x4\n" + "ldr s26, [x22], #0x4\n" "ldr s19, [x21], #0x4\n" - "ldr s21, [x20], #0x4\n" + "ldr s24, [x20], #0x4\n" "tbz %x[width], #1, 8f\n" "ld1 { v28.h }[2], [x27], #0x2\n" - "ld1 { v27.h }[2], [x26], #0x2\n" - "ld1 { v26.h }[2], [x25], #0x2\n" - "ld1 { v22.h }[2], [x24], #0x2\n" - "ld1 { v25.h }[2], [x23], #0x2\n" - "ld1 { v24.h }[2], [x22], #0x2\n" - "ld1 { v19.h }[2], [x21], #0x2\n" - "ld1 { v21.h }[2], [x20], #0x2\n" "mov x19, #0x2\n" + "ld1 { v29.h }[2], [x26], #0x2\n" + "ld1 { v25.h }[2], [x25], #0x2\n" + "ld1 { v21.h }[2], [x24], #0x2\n" + "ld1 { v27.h }[2], [x23], #0x2\n" + "ld1 { v26.h }[2], [x22], 
#0x2\n" + "ld1 { v19.h }[2], [x21], #0x2\n" + "ld1 { v24.h }[2], [x20], #0x2\n" "tbz %x[width], #0, 11f\n" "ld1 { v28.b }[6], [x27]\n" - "ld1 { v27.b }[6], [x26]\n" - "ld1 { v26.b }[6], [x25]\n" - "ld1 { v22.b }[6], [x24]\n" - "ld1 { v25.b }[6], [x23]\n" - "ld1 { v24.b }[6], [x22]\n" + "ld1 { v29.b }[6], [x26]\n" + "ld1 { v25.b }[6], [x25]\n" + "ld1 { v21.b }[6], [x24]\n" + "ld1 { v27.b }[6], [x23]\n" + "ld1 { v26.b }[6], [x22]\n" "ld1 { v19.b }[6], [x21]\n" - "ld1 { v21.b }[6], [x20]\n" + "ld1 { v24.b }[6], [x20]\n" "b 11f\n" "8:" // odd_loads_1_4 "mov x19, #0x1\n" "tbz %x[width], #0, 11f\n" "ld1 { v28.b }[4], [x27]\n" - "ld1 { v27.b }[4], [x26]\n" - "ld1 { v26.b }[4], [x25]\n" - "ld1 { v22.b }[4], [x24]\n" - "ld1 { v25.b }[4], [x23]\n" - "ld1 { v24.b }[4], [x22]\n" - "ld1 { v19.b }[4], [x21]\n" - "ld1 { v21.b }[4], [x20]\n" + "ld1 { v29.b }[4], [x26]\n" "mov x19, #0x2\n" + "ld1 { v25.b }[4], [x25]\n" + "ld1 { v21.b }[4], [x24]\n" + "ld1 { v27.b }[4], [x23]\n" + "ld1 { v26.b }[4], [x22]\n" + "ld1 { v19.b }[4], [x21]\n" + "ld1 { v24.b }[4], [x20]\n" "b 11f\n" "9:" // odd_loads_2_0 "tbz %x[width], #1, 10f\n" "ldr h28, [x27], #0x2\n" - "ldr h27, [x26], #0x2\n" - "ldr h26, [x25], #0x2\n" - "ldr h22, [x24], #0x2\n" - "ldr h25, [x23], #0x2\n" - "ldr h24, [x22], #0x2\n" - "ldr h19, [x21], #0x2\n" - "ldr h21, [x20], #0x2\n" + "ldr h29, [x26], #0x2\n" "mov x19, #0x1\n" + "ldr h25, [x25], #0x2\n" + "ldr h21, [x24], #0x2\n" + "ldr h27, [x23], #0x2\n" + "ldr h26, [x22], #0x2\n" + "ldr h19, [x21], #0x2\n" + "ldr h24, [x20], #0x2\n" "tbz %x[width], #0, 11f\n" "ld1 { v28.b }[2], [x27]\n" - "ld1 { v27.b }[2], [x26]\n" - "ld1 { v26.b }[2], [x25]\n" - "ld1 { v22.b }[2], [x24]\n" - "ld1 { v25.b }[2], [x23]\n" - "ld1 { v24.b }[2], [x22]\n" + "ld1 { v29.b }[2], [x26]\n" + "ld1 { v25.b }[2], [x25]\n" + "ld1 { v21.b }[2], [x24]\n" + "ld1 { v27.b }[2], [x23]\n" + "ld1 { v26.b }[2], [x22]\n" "ld1 { v19.b }[2], [x21]\n" - "ld1 { v21.b }[2], [x20]\n" + "ld1 { v24.b }[2], [x20]\n" "b 11f\n" 
"10:" // odd_loads_1_0 "ldr b28, [x27, #0x0]\n" - "ldr b27, [x26, #0x0]\n" - "ldr b26, [x25, #0x0]\n" - "ldr b22, [x24, #0x0]\n" - "ldr b25, [x23, #0x0]\n" - "ldr b24, [x22, #0x0]\n" - "ldr b19, [x21, #0x0]\n" - "ldr b21, [x20, #0x0]\n" "mov x19, #0x1\n" + "ldr b29, [x26, #0x0]\n" + "ldr b25, [x25, #0x0]\n" + "ldr b21, [x24, #0x0]\n" + "ldr b27, [x23, #0x0]\n" + "ldr b26, [x22, #0x0]\n" + "ldr b19, [x21, #0x0]\n" + "ldr b24, [x20, #0x0]\n" "11:" // Odd load end - "zip1 v23.4s, v28.4s, v26.4s\n" + "zip1 v22.4s, v28.4s, v25.4s\n" "subs x19, x19, #0x1\n" - "zip1 v20.4s, v27.4s, v22.4s\n" - "zip1 v16.4s, v23.4s, v20.4s\n" - "str q16, [%x[out_ptr], #0x0]\n" - "zip1 v18.4s, v25.4s, v19.4s\n" - "zip1 v16.4s, v24.4s, v21.4s\n" + "zip1 v20.4s, v29.4s, v21.4s\n" + "zip1 v23.4s, v22.4s, v20.4s\n" + "str q23, [%x[out_ptr], #0x0]\n" + "zip1 v18.4s, v27.4s, v19.4s\n" + "zip1 v16.4s, v26.4s, v24.4s\n" "zip1 v17.4s, v18.4s, v16.4s\n" "str q17, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 12f\n" - "zip2 v23.4s, v23.4s, v20.4s\n" - "zip2 v16.4s, v18.4s, v16.4s\n" - "str q23, [%x[out_ptr], #0x0]\n" - "str q16, [%x[out_ptr], #0x10]\n" + "zip2 v22.4s, v22.4s, v20.4s\n" + "str q22, [%x[out_ptr], #0x0]\n" + "zip2 v20.4s, v18.4s, v16.4s\n" "subs x19, x19, #0x1\n" + "str q20, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 12f\n" - "zip2 v26.4s, v28.4s, v26.4s\n" - "zip2 v22.4s, v27.4s, v22.4s\n" + "zip2 v28.4s, v28.4s, v25.4s\n" + "zip2 v25.4s, v29.4s, v21.4s\n" "subs x19, x19, #0x1\n" - "zip1 v20.4s, v26.4s, v22.4s\n" - "str q20, [%x[out_ptr], #0x0]\n" - "zip2 v19.4s, v25.4s, v19.4s\n" - "zip2 v18.4s, v24.4s, v21.4s\n" - "zip1 v16.4s, v19.4s, v18.4s\n" - "str q16, [%x[out_ptr], #0x10]\n" + "zip1 v21.4s, v28.4s, v25.4s\n" + "str q21, [%x[out_ptr], #0x0]\n" + "zip2 v19.4s, v27.4s, v19.4s\n" + "zip2 v16.4s, v26.4s, v24.4s\n" + "zip1 v18.4s, v19.4s, v16.4s\n" + "str q18, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 12f\n" - 
"zip2 v17.4s, v26.4s, v22.4s\n" - "zip2 v16.4s, v19.4s, v18.4s\n" + "zip2 v17.4s, v28.4s, v25.4s\n" "str q17, [%x[out_ptr], #0x0]\n" + "zip2 v16.4s, v19.4s, v16.4s\n" "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "12:" // Odds skip - : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp index 2831cb79a6..dfec94c952 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -97,270 +97,270 @@ void interleave_block<8, 4, VLType::None, true>( "movi v0.8h, #0x0\n" "mov x19, #0x0\n" "4:" // no_accumulate_16 - "ldr q29, [x27], #0x10\n" + "ldr q28, [x27], #0x10\n" + "add x19, x19, #0x1\n" + "ldr q29, [x26], #0x10\n" + "subs %x[width], %x[width], #0x10\n" + "ldr q25, [x25], #0x10\n" + "zip1 v22.4s, v28.4s, v25.4s\n" + "ldr q21, [x24], #0x10\n" + "cmp %x[width], #0x10\n" + "zip2 v28.4s, v28.4s, v25.4s\n" + "ldr q27, [x23], #0x10\n" + "ldr q26, [x22], #0x10\n" + "zip1 v20.4s, v29.4s, v21.4s\n" + "ldr q19, [x21], #0x10\n" + "zip2 v25.4s, v29.4s, v21.4s\n" + "ldr q24, [x20], #0x10\n" + "zip1 v23.4s, v22.4s, v20.4s\n" "prfm pldl1keep, [x27, #0x70]\n" - "ldr q28, [x26], #0x10\n" - "ldr q27, [x25], #0x10\n" - "zip1 v23.4s, v29.4s, v27.4s\n" + "sadalp v1.8h, v23.16b\n" + "zip2 v22.4s, v22.4s, v20.4s\n" "prfm pldl1keep, [x26, #0x70]\n" - "ldr q21, [x24], #0x10\n" - "zip2 v27.4s, v29.4s, v27.4s\n" + "sadalp v1.8h, v22.16b\n" + "zip1 v18.4s, v27.4s, v19.4s\n" "prfm pldl1keep, [x25, #0x70]\n" - "ldr q26, [x23], #0x10\n" - "zip1 v20.4s, v28.4s, v21.4s\n" + "zip1 v16.4s, v26.4s, v24.4s\n" "prfm pldl1keep, [x24, #0x70]\n" - "ldr q25, [x22], #0x10\n" - "zip1 v16.4s, v23.4s, v20.4s\n" + "zip1 v21.4s, v28.4s, v25.4s\n" "prfm pldl1keep, [x23, #0x70]\n" - "ldr q19, [x21], #0x10\n" - "zip2 v24.4s, v23.4s, v20.4s\n" + "sadalp v1.8h, v21.16b\n" + "zip1 v17.4s, v18.4s, v16.4s\n" "prfm pldl1keep, [x22, #0x70]\n" - "zip2 v23.4s, v28.4s, v21.4s\n" - "ldr q22, [x20], #0x10\n" - "zip1 v18.4s, v26.4s, v19.4s\n" + "zip2 v20.4s, v18.4s, v16.4s\n" "prfm pldl1keep, [x21, #0x70]\n" - "str q16, [%x[out_ptr], #0x0]\n" - "zip1 v21.4s, v27.4s, v23.4s\n" + "sadalp v0.8h, v17.16b\n" + "zip2 v19.4s, v27.4s, v19.4s\n" "prfm pldl1keep, [x20, #0x70]\n" - "zip1 v17.4s, v25.4s, v22.4s\n" - "sadalp v1.8h, v16.16b\n" - "zip1 v16.4s, v18.4s, v17.4s\n" - "add x19, x19, #0x1\n" - "zip2 v20.4s, v18.4s, v17.4s\n" - "str q16, [%x[out_ptr], #0x10]\n" - "zip2 
v19.4s, v26.4s, v19.4s\n" - "sadalp v0.8h, v16.16b\n" - "zip2 v16.4s, v25.4s, v22.4s\n" - "str q24, [%x[out_ptr], #0x20]\n" + "sadalp v0.8h, v20.16b\n" + "zip2 v16.4s, v26.4s, v24.4s\n" + "str q23, [%x[out_ptr], #0x0]\n" "zip1 v18.4s, v19.4s, v16.4s\n" - "sadalp v1.8h, v24.16b\n" - "zip2 v17.4s, v27.4s, v23.4s\n" - "str q20, [%x[out_ptr], #0x30]\n" + "str q17, [%x[out_ptr], #0x10]\n" + "sadalp v0.8h, v18.16b\n" + "zip2 v17.4s, v28.4s, v25.4s\n" + "str q22, [%x[out_ptr], #0x20]\n" "zip2 v16.4s, v19.4s, v16.4s\n" + "str q20, [%x[out_ptr], #0x30]\n" + "sadalp v1.8h, v17.16b\n" "str q21, [%x[out_ptr], #0x40]\n" + "sadalp v0.8h, v16.16b\n" "str q18, [%x[out_ptr], #0x50]\n" - "sadalp v0.8h, v20.16b\n" "str q17, [%x[out_ptr], #0x60]\n" - "sadalp v1.8h, v21.16b\n" "str q16, [%x[out_ptr], #0x70]\n" - "subs %x[width], %x[width], #0x10\n" - "sadalp v0.8h, v18.16b\n" - "cmp %x[width], #0x10\n" - "sadalp v1.8h, v17.16b\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" - "sadalp v0.8h, v16.16b\n" "bge 3b\n" "5:" // Main loop skip "cbz %x[width], 14f\n" "tbz %x[width], #3, 9f\n" - "ldr d29, [x27], #0x8\n" - "ldr d28, [x26], #0x8\n" - "ldr d27, [x25], #0x8\n" + "ldr d28, [x27], #0x8\n" + "ldr d29, [x26], #0x8\n" + "ldr d25, [x25], #0x8\n" "ldr d21, [x24], #0x8\n" - "ldr d26, [x23], #0x8\n" - "ldr d25, [x22], #0x8\n" + "ldr d27, [x23], #0x8\n" + "ldr d26, [x22], #0x8\n" "ldr d19, [x21], #0x8\n" - "ldr d22, [x20], #0x8\n" + "ldr d24, [x20], #0x8\n" "tbz %x[width], #2, 7f\n" - "ld1 { v29.s }[2], [x27], #0x4\n" - "ld1 { v28.s }[2], [x26], #0x4\n" - "ld1 { v27.s }[2], [x25], #0x4\n" + "ld1 { v28.s }[2], [x27], #0x4\n" + "ld1 { v29.s }[2], [x26], #0x4\n" + "ld1 { v25.s }[2], [x25], #0x4\n" "ld1 { v21.s }[2], [x24], #0x4\n" - "ld1 { v26.s }[2], [x23], #0x4\n" - "ld1 { v25.s }[2], [x22], #0x4\n" + "ld1 { v27.s }[2], [x23], #0x4\n" + "ld1 { v26.s }[2], [x22], #0x4\n" "ld1 { v19.s }[2], [x21], #0x4\n" - "ld1 { v22.s }[2], [x20], #0x4\n" + "ld1 { v24.s }[2], [x20], #0x4\n" "tbz %x[width], #1, 6f\n" 
- "ld1 { v29.h }[6], [x27], #0x2\n" - "ld1 { v28.h }[6], [x26], #0x2\n" - "ld1 { v27.h }[6], [x25], #0x2\n" + "ld1 { v28.h }[6], [x27], #0x2\n" + "mov x19, #0x4\n" + "ld1 { v29.h }[6], [x26], #0x2\n" + "ld1 { v25.h }[6], [x25], #0x2\n" "ld1 { v21.h }[6], [x24], #0x2\n" - "ld1 { v26.h }[6], [x23], #0x2\n" - "ld1 { v25.h }[6], [x22], #0x2\n" + "ld1 { v27.h }[6], [x23], #0x2\n" + "ld1 { v26.h }[6], [x22], #0x2\n" "ld1 { v19.h }[6], [x21], #0x2\n" - "ld1 { v22.h }[6], [x20], #0x2\n" - "mov x19, #0x4\n" + "ld1 { v24.h }[6], [x20], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v29.b }[14], [x27]\n" - "ld1 { v28.b }[14], [x26]\n" - "ld1 { v27.b }[14], [x25]\n" + "ld1 { v28.b }[14], [x27]\n" + "ld1 { v29.b }[14], [x26]\n" + "ld1 { v25.b }[14], [x25]\n" "ld1 { v21.b }[14], [x24]\n" - "ld1 { v26.b }[14], [x23]\n" - "ld1 { v25.b }[14], [x22]\n" + "ld1 { v27.b }[14], [x23]\n" + "ld1 { v26.b }[14], [x22]\n" "ld1 { v19.b }[14], [x21]\n" - "ld1 { v22.b }[14], [x20]\n" + "ld1 { v24.b }[14], [x20]\n" "b 13f\n" "6:" // odd_loads_1_12 "mov x19, #0x3\n" "tbz %x[width], #0, 13f\n" - "ld1 { v29.b }[12], [x27]\n" - "ld1 { v28.b }[12], [x26]\n" - "ld1 { v27.b }[12], [x25]\n" + "ld1 { v28.b }[12], [x27]\n" + "ld1 { v29.b }[12], [x26]\n" + "mov x19, #0x4\n" + "ld1 { v25.b }[12], [x25]\n" "ld1 { v21.b }[12], [x24]\n" - "ld1 { v26.b }[12], [x23]\n" - "ld1 { v25.b }[12], [x22]\n" + "ld1 { v27.b }[12], [x23]\n" + "ld1 { v26.b }[12], [x22]\n" "ld1 { v19.b }[12], [x21]\n" - "ld1 { v22.b }[12], [x20]\n" - "mov x19, #0x4\n" + "ld1 { v24.b }[12], [x20]\n" "b 13f\n" "7:" // odd_loads_2_8 "tbz %x[width], #1, 8f\n" - "ld1 { v29.h }[4], [x27], #0x2\n" - "ld1 { v28.h }[4], [x26], #0x2\n" - "ld1 { v27.h }[4], [x25], #0x2\n" + "ld1 { v28.h }[4], [x27], #0x2\n" + "ld1 { v29.h }[4], [x26], #0x2\n" + "mov x19, #0x3\n" + "ld1 { v25.h }[4], [x25], #0x2\n" "ld1 { v21.h }[4], [x24], #0x2\n" - "ld1 { v26.h }[4], [x23], #0x2\n" - "ld1 { v25.h }[4], [x22], #0x2\n" + "ld1 { v27.h }[4], [x23], #0x2\n" + "ld1 { v26.h 
}[4], [x22], #0x2\n" "ld1 { v19.h }[4], [x21], #0x2\n" - "ld1 { v22.h }[4], [x20], #0x2\n" - "mov x19, #0x3\n" + "ld1 { v24.h }[4], [x20], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v29.b }[10], [x27]\n" - "ld1 { v28.b }[10], [x26]\n" - "ld1 { v27.b }[10], [x25]\n" + "ld1 { v28.b }[10], [x27]\n" + "ld1 { v29.b }[10], [x26]\n" + "ld1 { v25.b }[10], [x25]\n" "ld1 { v21.b }[10], [x24]\n" - "ld1 { v26.b }[10], [x23]\n" - "ld1 { v25.b }[10], [x22]\n" + "ld1 { v27.b }[10], [x23]\n" + "ld1 { v26.b }[10], [x22]\n" "ld1 { v19.b }[10], [x21]\n" - "ld1 { v22.b }[10], [x20]\n" + "ld1 { v24.b }[10], [x20]\n" "b 13f\n" "8:" // odd_loads_1_8 "mov x19, #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v29.b }[8], [x27]\n" - "ld1 { v28.b }[8], [x26]\n" - "ld1 { v27.b }[8], [x25]\n" + "ld1 { v28.b }[8], [x27]\n" + "ld1 { v29.b }[8], [x26]\n" + "mov x19, #0x3\n" + "ld1 { v25.b }[8], [x25]\n" "ld1 { v21.b }[8], [x24]\n" - "ld1 { v26.b }[8], [x23]\n" - "ld1 { v25.b }[8], [x22]\n" + "ld1 { v27.b }[8], [x23]\n" + "ld1 { v26.b }[8], [x22]\n" "ld1 { v19.b }[8], [x21]\n" - "ld1 { v22.b }[8], [x20]\n" - "mov x19, #0x3\n" + "ld1 { v24.b }[8], [x20]\n" "b 13f\n" "9:" // odd_loads_4_0 "tbz %x[width], #2, 11f\n" - "ldr s29, [x27], #0x4\n" - "ldr s28, [x26], #0x4\n" - "ldr s27, [x25], #0x4\n" + "ldr s28, [x27], #0x4\n" + "ldr s29, [x26], #0x4\n" + "ldr s25, [x25], #0x4\n" "ldr s21, [x24], #0x4\n" - "ldr s26, [x23], #0x4\n" - "ldr s25, [x22], #0x4\n" + "ldr s27, [x23], #0x4\n" + "ldr s26, [x22], #0x4\n" "ldr s19, [x21], #0x4\n" - "ldr s22, [x20], #0x4\n" + "ldr s24, [x20], #0x4\n" "tbz %x[width], #1, 10f\n" - "ld1 { v29.h }[2], [x27], #0x2\n" - "ld1 { v28.h }[2], [x26], #0x2\n" - "ld1 { v27.h }[2], [x25], #0x2\n" + "ld1 { v28.h }[2], [x27], #0x2\n" + "mov x19, #0x2\n" + "ld1 { v29.h }[2], [x26], #0x2\n" + "ld1 { v25.h }[2], [x25], #0x2\n" "ld1 { v21.h }[2], [x24], #0x2\n" - "ld1 { v26.h }[2], [x23], #0x2\n" - "ld1 { v25.h }[2], [x22], #0x2\n" + "ld1 { v27.h }[2], [x23], #0x2\n" + "ld1 { v26.h }[2], 
[x22], #0x2\n" "ld1 { v19.h }[2], [x21], #0x2\n" - "ld1 { v22.h }[2], [x20], #0x2\n" - "mov x19, #0x2\n" + "ld1 { v24.h }[2], [x20], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v29.b }[6], [x27]\n" - "ld1 { v28.b }[6], [x26]\n" - "ld1 { v27.b }[6], [x25]\n" + "ld1 { v28.b }[6], [x27]\n" + "ld1 { v29.b }[6], [x26]\n" + "ld1 { v25.b }[6], [x25]\n" "ld1 { v21.b }[6], [x24]\n" - "ld1 { v26.b }[6], [x23]\n" - "ld1 { v25.b }[6], [x22]\n" + "ld1 { v27.b }[6], [x23]\n" + "ld1 { v26.b }[6], [x22]\n" "ld1 { v19.b }[6], [x21]\n" - "ld1 { v22.b }[6], [x20]\n" + "ld1 { v24.b }[6], [x20]\n" "b 13f\n" "10:" // odd_loads_1_4 "mov x19, #0x1\n" "tbz %x[width], #0, 13f\n" - "ld1 { v29.b }[4], [x27]\n" - "ld1 { v28.b }[4], [x26]\n" - "ld1 { v27.b }[4], [x25]\n" + "ld1 { v28.b }[4], [x27]\n" + "ld1 { v29.b }[4], [x26]\n" + "mov x19, #0x2\n" + "ld1 { v25.b }[4], [x25]\n" "ld1 { v21.b }[4], [x24]\n" - "ld1 { v26.b }[4], [x23]\n" - "ld1 { v25.b }[4], [x22]\n" + "ld1 { v27.b }[4], [x23]\n" + "ld1 { v26.b }[4], [x22]\n" "ld1 { v19.b }[4], [x21]\n" - "ld1 { v22.b }[4], [x20]\n" - "mov x19, #0x2\n" + "ld1 { v24.b }[4], [x20]\n" "b 13f\n" "11:" // odd_loads_2_0 "tbz %x[width], #1, 12f\n" - "ldr h29, [x27], #0x2\n" - "ldr h28, [x26], #0x2\n" - "ldr h27, [x25], #0x2\n" + "ldr h28, [x27], #0x2\n" + "ldr h29, [x26], #0x2\n" + "mov x19, #0x1\n" + "ldr h25, [x25], #0x2\n" "ldr h21, [x24], #0x2\n" - "ldr h26, [x23], #0x2\n" - "ldr h25, [x22], #0x2\n" + "ldr h27, [x23], #0x2\n" + "ldr h26, [x22], #0x2\n" "ldr h19, [x21], #0x2\n" - "ldr h22, [x20], #0x2\n" - "mov x19, #0x1\n" + "ldr h24, [x20], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v29.b }[2], [x27]\n" - "ld1 { v28.b }[2], [x26]\n" - "ld1 { v27.b }[2], [x25]\n" + "ld1 { v28.b }[2], [x27]\n" + "ld1 { v29.b }[2], [x26]\n" + "ld1 { v25.b }[2], [x25]\n" "ld1 { v21.b }[2], [x24]\n" - "ld1 { v26.b }[2], [x23]\n" - "ld1 { v25.b }[2], [x22]\n" + "ld1 { v27.b }[2], [x23]\n" + "ld1 { v26.b }[2], [x22]\n" "ld1 { v19.b }[2], [x21]\n" - "ld1 { v22.b }[2], 
[x20]\n" + "ld1 { v24.b }[2], [x20]\n" "b 13f\n" "12:" // odd_loads_1_0 - "ldr b29, [x27, #0x0]\n" - "ldr b28, [x26, #0x0]\n" - "ldr b27, [x25, #0x0]\n" + "ldr b28, [x27, #0x0]\n" + "mov x19, #0x1\n" + "ldr b29, [x26, #0x0]\n" + "ldr b25, [x25, #0x0]\n" "ldr b21, [x24, #0x0]\n" - "ldr b26, [x23, #0x0]\n" - "ldr b25, [x22, #0x0]\n" + "ldr b27, [x23, #0x0]\n" + "ldr b26, [x22, #0x0]\n" "ldr b19, [x21, #0x0]\n" - "ldr b22, [x20, #0x0]\n" - "mov x19, #0x1\n" + "ldr b24, [x20, #0x0]\n" "13:" // Odd load end - "zip1 v23.4s, v29.4s, v27.4s\n" + "zip1 v22.4s, v28.4s, v25.4s\n" "subs x19, x19, #0x1\n" - "zip1 v20.4s, v28.4s, v21.4s\n" - "zip1 v16.4s, v23.4s, v20.4s\n" - "str q16, [%x[out_ptr], #0x0]\n" - "zip1 v18.4s, v26.4s, v19.4s\n" - "sadalp v1.8h, v16.16b\n" - "zip1 v17.4s, v25.4s, v22.4s\n" - "zip1 v16.4s, v18.4s, v17.4s\n" - "str q16, [%x[out_ptr], #0x10]\n" - "sadalp v0.8h, v16.16b\n" + "zip1 v20.4s, v29.4s, v21.4s\n" + "zip1 v23.4s, v22.4s, v20.4s\n" + "str q23, [%x[out_ptr], #0x0]\n" + "sadalp v1.8h, v23.16b\n" + "zip1 v18.4s, v27.4s, v19.4s\n" + "zip1 v16.4s, v26.4s, v24.4s\n" + "zip1 v17.4s, v18.4s, v16.4s\n" + "str q17, [%x[out_ptr], #0x10]\n" + "sadalp v0.8h, v17.16b\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 14f\n" - "zip2 v24.4s, v23.4s, v20.4s\n" - "zip2 v20.4s, v18.4s, v17.4s\n" - "str q24, [%x[out_ptr], #0x0]\n" - "sadalp v1.8h, v24.16b\n" + "zip2 v22.4s, v22.4s, v20.4s\n" + "str q22, [%x[out_ptr], #0x0]\n" + "zip2 v20.4s, v18.4s, v16.4s\n" + "sadalp v1.8h, v22.16b\n" "str q20, [%x[out_ptr], #0x10]\n" - "sadalp v0.8h, v20.16b\n" "subs x19, x19, #0x1\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" + "sadalp v0.8h, v20.16b\n" "beq 14f\n" - "zip2 v27.4s, v29.4s, v27.4s\n" - "zip2 v23.4s, v28.4s, v21.4s\n" + "zip2 v28.4s, v28.4s, v25.4s\n" + "zip2 v25.4s, v29.4s, v21.4s\n" "subs x19, x19, #0x1\n" - "zip1 v21.4s, v27.4s, v23.4s\n" + "zip1 v21.4s, v28.4s, v25.4s\n" "str q21, [%x[out_ptr], #0x0]\n" - "zip2 v19.4s, v26.4s, v19.4s\n" "sadalp v1.8h, v21.16b\n" - 
"zip2 v16.4s, v25.4s, v22.4s\n" + "zip2 v19.4s, v27.4s, v19.4s\n" + "zip2 v16.4s, v26.4s, v24.4s\n" "zip1 v18.4s, v19.4s, v16.4s\n" "str q18, [%x[out_ptr], #0x10]\n" "sadalp v0.8h, v18.16b\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 14f\n" - "zip2 v17.4s, v27.4s, v23.4s\n" - "zip2 v16.4s, v19.4s, v16.4s\n" + "zip2 v17.4s, v28.4s, v25.4s\n" "str q17, [%x[out_ptr], #0x0]\n" + "zip2 v16.4s, v19.4s, v16.4s\n" "sadalp v1.8h, v17.16b\n" "str q16, [%x[out_ptr], #0x10]\n" - "sadalp v0.8h, v16.16b\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" + "sadalp v0.8h, v16.16b\n" "14:" // Odds skip "sadalp v31.4s, v1.8h\n" - "sadalp v30.4s, v0.8h\n" "str q31, [%x[out_ptr], #0x0]\n" + "sadalp v30.4s, v0.8h\n" "str q30, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" - : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp index 7c7857bcd0..1b94c7f1f1 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -97,270 +97,270 @@ void interleave_block<8, 4, VLType::None, true>( "movi v0.8h, #0x0\n" "mov x19, #0x0\n" "4:" // no_accumulate_16 - "ldr q29, [x27], #0x10\n" + "ldr q28, [x27], #0x10\n" + "add x19, x19, #0x1\n" + "ldr q29, [x26], #0x10\n" + "subs %x[width], %x[width], #0x10\n" + "ldr q25, [x25], #0x10\n" + "zip1 v22.4s, v28.4s, v25.4s\n" + "ldr q21, [x24], #0x10\n" + "cmp %x[width], #0x10\n" + "zip2 v28.4s, v28.4s, v25.4s\n" + "ldr q27, [x23], #0x10\n" + "ldr q26, [x22], #0x10\n" + "zip1 v20.4s, v29.4s, v21.4s\n" + "ldr q19, [x21], #0x10\n" + "zip2 v25.4s, v29.4s, v21.4s\n" + "ldr q24, [x20], #0x10\n" + "zip1 v23.4s, v22.4s, v20.4s\n" "prfm pldl1keep, [x27, #0x70]\n" - "ldr q28, [x26], #0x10\n" - "ldr q27, [x25], #0x10\n" - "zip1 v23.4s, v29.4s, v27.4s\n" + "uadalp v1.8h, v23.16b\n" + "zip2 v22.4s, v22.4s, v20.4s\n" "prfm pldl1keep, [x26, #0x70]\n" - "ldr q21, [x24], #0x10\n" - "zip2 v27.4s, v29.4s, v27.4s\n" + "uadalp v1.8h, v22.16b\n" + "zip1 v18.4s, v27.4s, v19.4s\n" "prfm pldl1keep, [x25, #0x70]\n" - "ldr q26, [x23], #0x10\n" - "zip1 v20.4s, v28.4s, v21.4s\n" + "zip1 v16.4s, v26.4s, v24.4s\n" "prfm pldl1keep, [x24, #0x70]\n" - "ldr q25, [x22], #0x10\n" - "zip1 v16.4s, v23.4s, v20.4s\n" + "zip1 v21.4s, v28.4s, v25.4s\n" "prfm pldl1keep, [x23, #0x70]\n" - "ldr q19, [x21], #0x10\n" - "zip2 v24.4s, v23.4s, v20.4s\n" + "uadalp v1.8h, v21.16b\n" + "zip1 v17.4s, v18.4s, v16.4s\n" "prfm pldl1keep, [x22, #0x70]\n" - "zip2 v23.4s, v28.4s, v21.4s\n" - "ldr q22, [x20], #0x10\n" - "zip1 v18.4s, v26.4s, v19.4s\n" + "zip2 v20.4s, v18.4s, v16.4s\n" "prfm pldl1keep, [x21, #0x70]\n" - "str q16, [%x[out_ptr], #0x0]\n" - "zip1 v21.4s, v27.4s, v23.4s\n" + "uadalp v0.8h, v17.16b\n" + "zip2 v19.4s, v27.4s, v19.4s\n" "prfm pldl1keep, [x20, #0x70]\n" - "zip1 v17.4s, v25.4s, v22.4s\n" - "uadalp v1.8h, v16.16b\n" - "zip1 v16.4s, v18.4s, v17.4s\n" - "add x19, x19, #0x1\n" - "zip2 v20.4s, v18.4s, v17.4s\n" - "str q16, [%x[out_ptr], #0x10]\n" - "zip2 
v19.4s, v26.4s, v19.4s\n" - "uadalp v0.8h, v16.16b\n" - "zip2 v16.4s, v25.4s, v22.4s\n" - "str q24, [%x[out_ptr], #0x20]\n" + "uadalp v0.8h, v20.16b\n" + "zip2 v16.4s, v26.4s, v24.4s\n" + "str q23, [%x[out_ptr], #0x0]\n" "zip1 v18.4s, v19.4s, v16.4s\n" - "uadalp v1.8h, v24.16b\n" - "zip2 v17.4s, v27.4s, v23.4s\n" - "str q20, [%x[out_ptr], #0x30]\n" + "str q17, [%x[out_ptr], #0x10]\n" + "uadalp v0.8h, v18.16b\n" + "zip2 v17.4s, v28.4s, v25.4s\n" + "str q22, [%x[out_ptr], #0x20]\n" "zip2 v16.4s, v19.4s, v16.4s\n" + "str q20, [%x[out_ptr], #0x30]\n" + "uadalp v1.8h, v17.16b\n" "str q21, [%x[out_ptr], #0x40]\n" + "uadalp v0.8h, v16.16b\n" "str q18, [%x[out_ptr], #0x50]\n" - "uadalp v0.8h, v20.16b\n" "str q17, [%x[out_ptr], #0x60]\n" - "uadalp v1.8h, v21.16b\n" "str q16, [%x[out_ptr], #0x70]\n" - "subs %x[width], %x[width], #0x10\n" - "uadalp v0.8h, v18.16b\n" - "cmp %x[width], #0x10\n" - "uadalp v1.8h, v17.16b\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" - "uadalp v0.8h, v16.16b\n" "bge 3b\n" "5:" // Main loop skip "cbz %x[width], 14f\n" "tbz %x[width], #3, 9f\n" - "ldr d29, [x27], #0x8\n" - "ldr d28, [x26], #0x8\n" - "ldr d27, [x25], #0x8\n" + "ldr d28, [x27], #0x8\n" + "ldr d29, [x26], #0x8\n" + "ldr d25, [x25], #0x8\n" "ldr d21, [x24], #0x8\n" - "ldr d26, [x23], #0x8\n" - "ldr d25, [x22], #0x8\n" + "ldr d27, [x23], #0x8\n" + "ldr d26, [x22], #0x8\n" "ldr d19, [x21], #0x8\n" - "ldr d22, [x20], #0x8\n" + "ldr d24, [x20], #0x8\n" "tbz %x[width], #2, 7f\n" - "ld1 { v29.s }[2], [x27], #0x4\n" - "ld1 { v28.s }[2], [x26], #0x4\n" - "ld1 { v27.s }[2], [x25], #0x4\n" + "ld1 { v28.s }[2], [x27], #0x4\n" + "ld1 { v29.s }[2], [x26], #0x4\n" + "ld1 { v25.s }[2], [x25], #0x4\n" "ld1 { v21.s }[2], [x24], #0x4\n" - "ld1 { v26.s }[2], [x23], #0x4\n" - "ld1 { v25.s }[2], [x22], #0x4\n" + "ld1 { v27.s }[2], [x23], #0x4\n" + "ld1 { v26.s }[2], [x22], #0x4\n" "ld1 { v19.s }[2], [x21], #0x4\n" - "ld1 { v22.s }[2], [x20], #0x4\n" + "ld1 { v24.s }[2], [x20], #0x4\n" "tbz %x[width], #1, 6f\n" 
- "ld1 { v29.h }[6], [x27], #0x2\n" - "ld1 { v28.h }[6], [x26], #0x2\n" - "ld1 { v27.h }[6], [x25], #0x2\n" + "ld1 { v28.h }[6], [x27], #0x2\n" + "mov x19, #0x4\n" + "ld1 { v29.h }[6], [x26], #0x2\n" + "ld1 { v25.h }[6], [x25], #0x2\n" "ld1 { v21.h }[6], [x24], #0x2\n" - "ld1 { v26.h }[6], [x23], #0x2\n" - "ld1 { v25.h }[6], [x22], #0x2\n" + "ld1 { v27.h }[6], [x23], #0x2\n" + "ld1 { v26.h }[6], [x22], #0x2\n" "ld1 { v19.h }[6], [x21], #0x2\n" - "ld1 { v22.h }[6], [x20], #0x2\n" - "mov x19, #0x4\n" + "ld1 { v24.h }[6], [x20], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v29.b }[14], [x27]\n" - "ld1 { v28.b }[14], [x26]\n" - "ld1 { v27.b }[14], [x25]\n" + "ld1 { v28.b }[14], [x27]\n" + "ld1 { v29.b }[14], [x26]\n" + "ld1 { v25.b }[14], [x25]\n" "ld1 { v21.b }[14], [x24]\n" - "ld1 { v26.b }[14], [x23]\n" - "ld1 { v25.b }[14], [x22]\n" + "ld1 { v27.b }[14], [x23]\n" + "ld1 { v26.b }[14], [x22]\n" "ld1 { v19.b }[14], [x21]\n" - "ld1 { v22.b }[14], [x20]\n" + "ld1 { v24.b }[14], [x20]\n" "b 13f\n" "6:" // odd_loads_1_12 "mov x19, #0x3\n" "tbz %x[width], #0, 13f\n" - "ld1 { v29.b }[12], [x27]\n" - "ld1 { v28.b }[12], [x26]\n" - "ld1 { v27.b }[12], [x25]\n" + "ld1 { v28.b }[12], [x27]\n" + "ld1 { v29.b }[12], [x26]\n" + "mov x19, #0x4\n" + "ld1 { v25.b }[12], [x25]\n" "ld1 { v21.b }[12], [x24]\n" - "ld1 { v26.b }[12], [x23]\n" - "ld1 { v25.b }[12], [x22]\n" + "ld1 { v27.b }[12], [x23]\n" + "ld1 { v26.b }[12], [x22]\n" "ld1 { v19.b }[12], [x21]\n" - "ld1 { v22.b }[12], [x20]\n" - "mov x19, #0x4\n" + "ld1 { v24.b }[12], [x20]\n" "b 13f\n" "7:" // odd_loads_2_8 "tbz %x[width], #1, 8f\n" - "ld1 { v29.h }[4], [x27], #0x2\n" - "ld1 { v28.h }[4], [x26], #0x2\n" - "ld1 { v27.h }[4], [x25], #0x2\n" + "ld1 { v28.h }[4], [x27], #0x2\n" + "ld1 { v29.h }[4], [x26], #0x2\n" + "mov x19, #0x3\n" + "ld1 { v25.h }[4], [x25], #0x2\n" "ld1 { v21.h }[4], [x24], #0x2\n" - "ld1 { v26.h }[4], [x23], #0x2\n" - "ld1 { v25.h }[4], [x22], #0x2\n" + "ld1 { v27.h }[4], [x23], #0x2\n" + "ld1 { v26.h 
}[4], [x22], #0x2\n" "ld1 { v19.h }[4], [x21], #0x2\n" - "ld1 { v22.h }[4], [x20], #0x2\n" - "mov x19, #0x3\n" + "ld1 { v24.h }[4], [x20], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v29.b }[10], [x27]\n" - "ld1 { v28.b }[10], [x26]\n" - "ld1 { v27.b }[10], [x25]\n" + "ld1 { v28.b }[10], [x27]\n" + "ld1 { v29.b }[10], [x26]\n" + "ld1 { v25.b }[10], [x25]\n" "ld1 { v21.b }[10], [x24]\n" - "ld1 { v26.b }[10], [x23]\n" - "ld1 { v25.b }[10], [x22]\n" + "ld1 { v27.b }[10], [x23]\n" + "ld1 { v26.b }[10], [x22]\n" "ld1 { v19.b }[10], [x21]\n" - "ld1 { v22.b }[10], [x20]\n" + "ld1 { v24.b }[10], [x20]\n" "b 13f\n" "8:" // odd_loads_1_8 "mov x19, #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v29.b }[8], [x27]\n" - "ld1 { v28.b }[8], [x26]\n" - "ld1 { v27.b }[8], [x25]\n" + "ld1 { v28.b }[8], [x27]\n" + "ld1 { v29.b }[8], [x26]\n" + "mov x19, #0x3\n" + "ld1 { v25.b }[8], [x25]\n" "ld1 { v21.b }[8], [x24]\n" - "ld1 { v26.b }[8], [x23]\n" - "ld1 { v25.b }[8], [x22]\n" + "ld1 { v27.b }[8], [x23]\n" + "ld1 { v26.b }[8], [x22]\n" "ld1 { v19.b }[8], [x21]\n" - "ld1 { v22.b }[8], [x20]\n" - "mov x19, #0x3\n" + "ld1 { v24.b }[8], [x20]\n" "b 13f\n" "9:" // odd_loads_4_0 "tbz %x[width], #2, 11f\n" - "ldr s29, [x27], #0x4\n" - "ldr s28, [x26], #0x4\n" - "ldr s27, [x25], #0x4\n" + "ldr s28, [x27], #0x4\n" + "ldr s29, [x26], #0x4\n" + "ldr s25, [x25], #0x4\n" "ldr s21, [x24], #0x4\n" - "ldr s26, [x23], #0x4\n" - "ldr s25, [x22], #0x4\n" + "ldr s27, [x23], #0x4\n" + "ldr s26, [x22], #0x4\n" "ldr s19, [x21], #0x4\n" - "ldr s22, [x20], #0x4\n" + "ldr s24, [x20], #0x4\n" "tbz %x[width], #1, 10f\n" - "ld1 { v29.h }[2], [x27], #0x2\n" - "ld1 { v28.h }[2], [x26], #0x2\n" - "ld1 { v27.h }[2], [x25], #0x2\n" + "ld1 { v28.h }[2], [x27], #0x2\n" + "mov x19, #0x2\n" + "ld1 { v29.h }[2], [x26], #0x2\n" + "ld1 { v25.h }[2], [x25], #0x2\n" "ld1 { v21.h }[2], [x24], #0x2\n" - "ld1 { v26.h }[2], [x23], #0x2\n" - "ld1 { v25.h }[2], [x22], #0x2\n" + "ld1 { v27.h }[2], [x23], #0x2\n" + "ld1 { v26.h }[2], 
[x22], #0x2\n" "ld1 { v19.h }[2], [x21], #0x2\n" - "ld1 { v22.h }[2], [x20], #0x2\n" - "mov x19, #0x2\n" + "ld1 { v24.h }[2], [x20], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v29.b }[6], [x27]\n" - "ld1 { v28.b }[6], [x26]\n" - "ld1 { v27.b }[6], [x25]\n" + "ld1 { v28.b }[6], [x27]\n" + "ld1 { v29.b }[6], [x26]\n" + "ld1 { v25.b }[6], [x25]\n" "ld1 { v21.b }[6], [x24]\n" - "ld1 { v26.b }[6], [x23]\n" - "ld1 { v25.b }[6], [x22]\n" + "ld1 { v27.b }[6], [x23]\n" + "ld1 { v26.b }[6], [x22]\n" "ld1 { v19.b }[6], [x21]\n" - "ld1 { v22.b }[6], [x20]\n" + "ld1 { v24.b }[6], [x20]\n" "b 13f\n" "10:" // odd_loads_1_4 "mov x19, #0x1\n" "tbz %x[width], #0, 13f\n" - "ld1 { v29.b }[4], [x27]\n" - "ld1 { v28.b }[4], [x26]\n" - "ld1 { v27.b }[4], [x25]\n" + "ld1 { v28.b }[4], [x27]\n" + "ld1 { v29.b }[4], [x26]\n" + "mov x19, #0x2\n" + "ld1 { v25.b }[4], [x25]\n" "ld1 { v21.b }[4], [x24]\n" - "ld1 { v26.b }[4], [x23]\n" - "ld1 { v25.b }[4], [x22]\n" + "ld1 { v27.b }[4], [x23]\n" + "ld1 { v26.b }[4], [x22]\n" "ld1 { v19.b }[4], [x21]\n" - "ld1 { v22.b }[4], [x20]\n" - "mov x19, #0x2\n" + "ld1 { v24.b }[4], [x20]\n" "b 13f\n" "11:" // odd_loads_2_0 "tbz %x[width], #1, 12f\n" - "ldr h29, [x27], #0x2\n" - "ldr h28, [x26], #0x2\n" - "ldr h27, [x25], #0x2\n" + "ldr h28, [x27], #0x2\n" + "ldr h29, [x26], #0x2\n" + "mov x19, #0x1\n" + "ldr h25, [x25], #0x2\n" "ldr h21, [x24], #0x2\n" - "ldr h26, [x23], #0x2\n" - "ldr h25, [x22], #0x2\n" + "ldr h27, [x23], #0x2\n" + "ldr h26, [x22], #0x2\n" "ldr h19, [x21], #0x2\n" - "ldr h22, [x20], #0x2\n" - "mov x19, #0x1\n" + "ldr h24, [x20], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v29.b }[2], [x27]\n" - "ld1 { v28.b }[2], [x26]\n" - "ld1 { v27.b }[2], [x25]\n" + "ld1 { v28.b }[2], [x27]\n" + "ld1 { v29.b }[2], [x26]\n" + "ld1 { v25.b }[2], [x25]\n" "ld1 { v21.b }[2], [x24]\n" - "ld1 { v26.b }[2], [x23]\n" - "ld1 { v25.b }[2], [x22]\n" + "ld1 { v27.b }[2], [x23]\n" + "ld1 { v26.b }[2], [x22]\n" "ld1 { v19.b }[2], [x21]\n" - "ld1 { v22.b }[2], 
[x20]\n" + "ld1 { v24.b }[2], [x20]\n" "b 13f\n" "12:" // odd_loads_1_0 - "ldr b29, [x27, #0x0]\n" - "ldr b28, [x26, #0x0]\n" - "ldr b27, [x25, #0x0]\n" + "ldr b28, [x27, #0x0]\n" + "mov x19, #0x1\n" + "ldr b29, [x26, #0x0]\n" + "ldr b25, [x25, #0x0]\n" "ldr b21, [x24, #0x0]\n" - "ldr b26, [x23, #0x0]\n" - "ldr b25, [x22, #0x0]\n" + "ldr b27, [x23, #0x0]\n" + "ldr b26, [x22, #0x0]\n" "ldr b19, [x21, #0x0]\n" - "ldr b22, [x20, #0x0]\n" - "mov x19, #0x1\n" + "ldr b24, [x20, #0x0]\n" "13:" // Odd load end - "zip1 v23.4s, v29.4s, v27.4s\n" + "zip1 v22.4s, v28.4s, v25.4s\n" "subs x19, x19, #0x1\n" - "zip1 v20.4s, v28.4s, v21.4s\n" - "zip1 v16.4s, v23.4s, v20.4s\n" - "str q16, [%x[out_ptr], #0x0]\n" - "zip1 v18.4s, v26.4s, v19.4s\n" - "uadalp v1.8h, v16.16b\n" - "zip1 v17.4s, v25.4s, v22.4s\n" - "zip1 v16.4s, v18.4s, v17.4s\n" - "str q16, [%x[out_ptr], #0x10]\n" - "uadalp v0.8h, v16.16b\n" + "zip1 v20.4s, v29.4s, v21.4s\n" + "zip1 v23.4s, v22.4s, v20.4s\n" + "str q23, [%x[out_ptr], #0x0]\n" + "uadalp v1.8h, v23.16b\n" + "zip1 v18.4s, v27.4s, v19.4s\n" + "zip1 v16.4s, v26.4s, v24.4s\n" + "zip1 v17.4s, v18.4s, v16.4s\n" + "str q17, [%x[out_ptr], #0x10]\n" + "uadalp v0.8h, v17.16b\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 14f\n" - "zip2 v24.4s, v23.4s, v20.4s\n" - "zip2 v20.4s, v18.4s, v17.4s\n" - "str q24, [%x[out_ptr], #0x0]\n" - "uadalp v1.8h, v24.16b\n" + "zip2 v22.4s, v22.4s, v20.4s\n" + "str q22, [%x[out_ptr], #0x0]\n" + "zip2 v20.4s, v18.4s, v16.4s\n" + "uadalp v1.8h, v22.16b\n" "str q20, [%x[out_ptr], #0x10]\n" - "uadalp v0.8h, v20.16b\n" "subs x19, x19, #0x1\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" + "uadalp v0.8h, v20.16b\n" "beq 14f\n" - "zip2 v27.4s, v29.4s, v27.4s\n" - "zip2 v23.4s, v28.4s, v21.4s\n" + "zip2 v28.4s, v28.4s, v25.4s\n" + "zip2 v25.4s, v29.4s, v21.4s\n" "subs x19, x19, #0x1\n" - "zip1 v21.4s, v27.4s, v23.4s\n" + "zip1 v21.4s, v28.4s, v25.4s\n" "str q21, [%x[out_ptr], #0x0]\n" - "zip2 v19.4s, v26.4s, v19.4s\n" "uadalp v1.8h, v21.16b\n" - 
"zip2 v16.4s, v25.4s, v22.4s\n" + "zip2 v19.4s, v27.4s, v19.4s\n" + "zip2 v16.4s, v26.4s, v24.4s\n" "zip1 v18.4s, v19.4s, v16.4s\n" "str q18, [%x[out_ptr], #0x10]\n" "uadalp v0.8h, v18.16b\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 14f\n" - "zip2 v17.4s, v27.4s, v23.4s\n" - "zip2 v16.4s, v19.4s, v16.4s\n" + "zip2 v17.4s, v28.4s, v25.4s\n" "str q17, [%x[out_ptr], #0x0]\n" + "zip2 v16.4s, v19.4s, v16.4s\n" "uadalp v1.8h, v17.16b\n" "str q16, [%x[out_ptr], #0x10]\n" - "uadalp v0.8h, v16.16b\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" + "uadalp v0.8h, v16.16b\n" "14:" // Odds skip "uadalp v31.4s, v1.8h\n" - "uadalp v30.4s, v0.8h\n" "str q31, [%x[out_ptr], #0x0]\n" + "uadalp v30.4s, v0.8h\n" "str q30, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" - : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp index 704a4c9210..1330593cbf 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -80,39 +80,39 @@ void interleave_block<8, 8, VLType::None, false>( "blt 3f\n" "2:" // Main loop head "ldr q27, [x27], #0x10\n" - "prfm pldl1keep, [x27, #0x70]\n" + "subs %x[width], %x[width], #0x10\n" "ldr q24, [x26], #0x10\n" "zip1 v26.2d, v27.2d, v24.2d\n" - "prfm pldl1keep, [x26, #0x70]\n" "ldr q25, [x25], #0x10\n" + "cmp %x[width], #0x10\n" "zip2 v24.2d, v27.2d, v24.2d\n" - "prfm pldl1keep, [x25, #0x70]\n" "ldr q21, [x24], #0x10\n" - "zip1 v23.2d, v25.2d, v21.2d\n" - "prfm pldl1keep, [x24, #0x70]\n" - "ldr q22, [x23], #0x10\n" + "ldr q23, [x23], #0x10\n" + "zip1 v22.2d, v25.2d, v21.2d\n" + "ldr q18, [x22], #0x10\n" "zip2 v21.2d, v25.2d, v21.2d\n" + "ldr q20, [x21], #0x10\n" + "ldr q16, [x20], #0x10\n" + "zip1 v19.2d, v23.2d, v18.2d\n" + "prfm pldl1keep, [x27, #0x70]\n" + "zip2 v18.2d, v23.2d, v18.2d\n" + "prfm pldl1keep, [x26, #0x70]\n" + "zip1 v17.2d, v20.2d, v16.2d\n" + "prfm pldl1keep, [x25, #0x70]\n" + "zip2 v16.2d, v20.2d, v16.2d\n" + "prfm pldl1keep, [x24, #0x70]\n" "prfm pldl1keep, [x23, #0x70]\n" - "ldr q18, [x22], #0x10\n" - "zip1 v20.2d, v22.2d, v18.2d\n" "prfm pldl1keep, [x22, #0x70]\n" - "ldr q19, [x21], #0x10\n" - "zip2 v18.2d, v22.2d, v18.2d\n" "prfm pldl1keep, [x21, #0x70]\n" - "ldr q16, [x20], #0x10\n" - "zip1 v17.2d, v19.2d, v16.2d\n" "prfm pldl1keep, [x20, #0x70]\n" "str q26, [%x[out_ptr], #0x0]\n" - "zip2 v16.2d, v19.2d, v16.2d\n" - "str q23, [%x[out_ptr], #0x10]\n" - "str q20, [%x[out_ptr], #0x20]\n" + "str q22, [%x[out_ptr], #0x10]\n" + "str q19, [%x[out_ptr], #0x20]\n" "str q17, [%x[out_ptr], #0x30]\n" "str q24, [%x[out_ptr], #0x40]\n" "str q21, [%x[out_ptr], #0x50]\n" "str q18, [%x[out_ptr], #0x60]\n" "str q16, [%x[out_ptr], #0x70]\n" - "subs %x[width], %x[width], #0x10\n" - "cmp %x[width], #0x10\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" "bge 2b\n" "3:" // Main loop skip @@ -122,37 +122,37 @@ void interleave_block<8, 8, VLType::None, false>( "ldr d24, [x26], #0x8\n" "ldr d25, [x25], #0x8\n" "ldr d21, 
[x24], #0x8\n" - "ldr d22, [x23], #0x8\n" + "ldr d23, [x23], #0x8\n" "ldr d18, [x22], #0x8\n" - "ldr d19, [x21], #0x8\n" + "ldr d20, [x21], #0x8\n" "ldr d16, [x20], #0x8\n" "tbz %x[width], #2, 5f\n" "ld1 { v27.s }[2], [x27], #0x4\n" "ld1 { v24.s }[2], [x26], #0x4\n" "ld1 { v25.s }[2], [x25], #0x4\n" "ld1 { v21.s }[2], [x24], #0x4\n" - "ld1 { v22.s }[2], [x23], #0x4\n" + "ld1 { v23.s }[2], [x23], #0x4\n" "ld1 { v18.s }[2], [x22], #0x4\n" - "ld1 { v19.s }[2], [x21], #0x4\n" + "ld1 { v20.s }[2], [x21], #0x4\n" "ld1 { v16.s }[2], [x20], #0x4\n" "tbz %x[width], #1, 4f\n" "ld1 { v27.h }[6], [x27], #0x2\n" + "mov x19, #0x2\n" "ld1 { v24.h }[6], [x26], #0x2\n" "ld1 { v25.h }[6], [x25], #0x2\n" "ld1 { v21.h }[6], [x24], #0x2\n" - "ld1 { v22.h }[6], [x23], #0x2\n" + "ld1 { v23.h }[6], [x23], #0x2\n" "ld1 { v18.h }[6], [x22], #0x2\n" - "ld1 { v19.h }[6], [x21], #0x2\n" + "ld1 { v20.h }[6], [x21], #0x2\n" "ld1 { v16.h }[6], [x20], #0x2\n" - "mov x19, #0x2\n" "tbz %x[width], #0, 11f\n" "ld1 { v27.b }[14], [x27]\n" "ld1 { v24.b }[14], [x26]\n" "ld1 { v25.b }[14], [x25]\n" "ld1 { v21.b }[14], [x24]\n" - "ld1 { v22.b }[14], [x23]\n" + "ld1 { v23.b }[14], [x23]\n" "ld1 { v18.b }[14], [x22]\n" - "ld1 { v19.b }[14], [x21]\n" + "ld1 { v20.b }[14], [x21]\n" "ld1 { v16.b }[14], [x20]\n" "b 11f\n" "4:" // odd_loads_1_12 @@ -162,30 +162,30 @@ void interleave_block<8, 8, VLType::None, false>( "ld1 { v24.b }[12], [x26]\n" "ld1 { v25.b }[12], [x25]\n" "ld1 { v21.b }[12], [x24]\n" - "ld1 { v22.b }[12], [x23]\n" + "ld1 { v23.b }[12], [x23]\n" "ld1 { v18.b }[12], [x22]\n" - "ld1 { v19.b }[12], [x21]\n" + "ld1 { v20.b }[12], [x21]\n" "ld1 { v16.b }[12], [x20]\n" "b 11f\n" "5:" // odd_loads_2_8 "tbz %x[width], #1, 6f\n" "ld1 { v27.h }[4], [x27], #0x2\n" "ld1 { v24.h }[4], [x26], #0x2\n" + "mov x19, #0x2\n" "ld1 { v25.h }[4], [x25], #0x2\n" "ld1 { v21.h }[4], [x24], #0x2\n" - "ld1 { v22.h }[4], [x23], #0x2\n" + "ld1 { v23.h }[4], [x23], #0x2\n" "ld1 { v18.h }[4], [x22], #0x2\n" - "ld1 { v19.h 
}[4], [x21], #0x2\n" + "ld1 { v20.h }[4], [x21], #0x2\n" "ld1 { v16.h }[4], [x20], #0x2\n" - "mov x19, #0x2\n" "tbz %x[width], #0, 11f\n" "ld1 { v27.b }[10], [x27]\n" "ld1 { v24.b }[10], [x26]\n" "ld1 { v25.b }[10], [x25]\n" "ld1 { v21.b }[10], [x24]\n" - "ld1 { v22.b }[10], [x23]\n" + "ld1 { v23.b }[10], [x23]\n" "ld1 { v18.b }[10], [x22]\n" - "ld1 { v19.b }[10], [x21]\n" + "ld1 { v20.b }[10], [x21]\n" "ld1 { v16.b }[10], [x20]\n" "b 11f\n" "6:" // odd_loads_1_8 @@ -193,13 +193,13 @@ void interleave_block<8, 8, VLType::None, false>( "tbz %x[width], #0, 11f\n" "ld1 { v27.b }[8], [x27]\n" "ld1 { v24.b }[8], [x26]\n" + "mov x19, #0x2\n" "ld1 { v25.b }[8], [x25]\n" "ld1 { v21.b }[8], [x24]\n" - "ld1 { v22.b }[8], [x23]\n" + "ld1 { v23.b }[8], [x23]\n" "ld1 { v18.b }[8], [x22]\n" - "ld1 { v19.b }[8], [x21]\n" + "ld1 { v20.b }[8], [x21]\n" "ld1 { v16.b }[8], [x20]\n" - "mov x19, #0x2\n" "b 11f\n" "7:" // odd_loads_4_0 "tbz %x[width], #2, 9f\n" @@ -207,28 +207,28 @@ void interleave_block<8, 8, VLType::None, false>( "ldr s24, [x26], #0x4\n" "ldr s25, [x25], #0x4\n" "ldr s21, [x24], #0x4\n" - "ldr s22, [x23], #0x4\n" + "ldr s23, [x23], #0x4\n" "ldr s18, [x22], #0x4\n" - "ldr s19, [x21], #0x4\n" + "ldr s20, [x21], #0x4\n" "ldr s16, [x20], #0x4\n" "tbz %x[width], #1, 8f\n" "ld1 { v27.h }[2], [x27], #0x2\n" + "mov x19, #0x1\n" "ld1 { v24.h }[2], [x26], #0x2\n" "ld1 { v25.h }[2], [x25], #0x2\n" "ld1 { v21.h }[2], [x24], #0x2\n" - "ld1 { v22.h }[2], [x23], #0x2\n" + "ld1 { v23.h }[2], [x23], #0x2\n" "ld1 { v18.h }[2], [x22], #0x2\n" - "ld1 { v19.h }[2], [x21], #0x2\n" + "ld1 { v20.h }[2], [x21], #0x2\n" "ld1 { v16.h }[2], [x20], #0x2\n" - "mov x19, #0x1\n" "tbz %x[width], #0, 11f\n" "ld1 { v27.b }[6], [x27]\n" "ld1 { v24.b }[6], [x26]\n" "ld1 { v25.b }[6], [x25]\n" "ld1 { v21.b }[6], [x24]\n" - "ld1 { v22.b }[6], [x23]\n" + "ld1 { v23.b }[6], [x23]\n" "ld1 { v18.b }[6], [x22]\n" - "ld1 { v19.b }[6], [x21]\n" + "ld1 { v20.b }[6], [x21]\n" "ld1 { v16.b }[6], [x20]\n" "b 11f\n" 
"8:" // odd_loads_1_4 @@ -238,66 +238,66 @@ void interleave_block<8, 8, VLType::None, false>( "ld1 { v24.b }[4], [x26]\n" "ld1 { v25.b }[4], [x25]\n" "ld1 { v21.b }[4], [x24]\n" - "ld1 { v22.b }[4], [x23]\n" + "ld1 { v23.b }[4], [x23]\n" "ld1 { v18.b }[4], [x22]\n" - "ld1 { v19.b }[4], [x21]\n" + "ld1 { v20.b }[4], [x21]\n" "ld1 { v16.b }[4], [x20]\n" "b 11f\n" "9:" // odd_loads_2_0 "tbz %x[width], #1, 10f\n" "ldr h27, [x27], #0x2\n" "ldr h24, [x26], #0x2\n" + "mov x19, #0x1\n" "ldr h25, [x25], #0x2\n" "ldr h21, [x24], #0x2\n" - "ldr h22, [x23], #0x2\n" + "ldr h23, [x23], #0x2\n" "ldr h18, [x22], #0x2\n" - "ldr h19, [x21], #0x2\n" + "ldr h20, [x21], #0x2\n" "ldr h16, [x20], #0x2\n" - "mov x19, #0x1\n" "tbz %x[width], #0, 11f\n" "ld1 { v27.b }[2], [x27]\n" "ld1 { v24.b }[2], [x26]\n" "ld1 { v25.b }[2], [x25]\n" "ld1 { v21.b }[2], [x24]\n" - "ld1 { v22.b }[2], [x23]\n" + "ld1 { v23.b }[2], [x23]\n" "ld1 { v18.b }[2], [x22]\n" - "ld1 { v19.b }[2], [x21]\n" + "ld1 { v20.b }[2], [x21]\n" "ld1 { v16.b }[2], [x20]\n" "b 11f\n" "10:" // odd_loads_1_0 "ldr b27, [x27, #0x0]\n" + "mov x19, #0x1\n" "ldr b24, [x26, #0x0]\n" "ldr b25, [x25, #0x0]\n" "ldr b21, [x24, #0x0]\n" - "ldr b22, [x23, #0x0]\n" + "ldr b23, [x23, #0x0]\n" "ldr b18, [x22, #0x0]\n" - "ldr b19, [x21, #0x0]\n" + "ldr b20, [x21, #0x0]\n" "ldr b16, [x20, #0x0]\n" - "mov x19, #0x1\n" "11:" // Odd load end "zip1 v26.2d, v27.2d, v24.2d\n" - "subs x19, x19, #0x1\n" - "zip1 v23.2d, v25.2d, v21.2d\n" "str q26, [%x[out_ptr], #0x0]\n" - "zip1 v20.2d, v22.2d, v18.2d\n" - "str q23, [%x[out_ptr], #0x10]\n" - "zip1 v17.2d, v19.2d, v16.2d\n" - "str q20, [%x[out_ptr], #0x20]\n" + "zip1 v22.2d, v25.2d, v21.2d\n" + "subs x19, x19, #0x1\n" + "zip1 v19.2d, v23.2d, v18.2d\n" + "str q22, [%x[out_ptr], #0x10]\n" + "zip1 v17.2d, v20.2d, v16.2d\n" + "str q19, [%x[out_ptr], #0x20]\n" "str q17, [%x[out_ptr], #0x30]\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "beq 12f\n" "zip2 v24.2d, v27.2d, v24.2d\n" - "zip2 v21.2d, v25.2d, v21.2d\n" 
"str q24, [%x[out_ptr], #0x0]\n" - "zip2 v18.2d, v22.2d, v18.2d\n" + "zip2 v21.2d, v25.2d, v21.2d\n" + "zip2 v18.2d, v23.2d, v18.2d\n" "str q21, [%x[out_ptr], #0x10]\n" - "zip2 v16.2d, v19.2d, v16.2d\n" + "zip2 v16.2d, v20.2d, v16.2d\n" "str q18, [%x[out_ptr], #0x20]\n" "str q16, [%x[out_ptr], #0x30]\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "12:" // Odds skip - : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp index 2317ece790..3550830fc3 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -108,48 +108,48 @@ void interleave_block<8, 8, VLType::None, true>( "mov x19, #0x0\n" "4:" // no_accumulate_16 "ldr q27, [x27], #0x10\n" - "prfm pldl1keep, [x27, #0x70]\n" + "add x19, x19, #0x1\n" "ldr q24, [x26], #0x10\n" "zip1 v26.2d, v27.2d, v24.2d\n" - "prfm pldl1keep, [x26, #0x70]\n" "ldr q25, [x25], #0x10\n" + "subs %x[width], %x[width], #0x10\n" "zip2 v24.2d, v27.2d, v24.2d\n" - "prfm pldl1keep, [x25, #0x70]\n" "ldr q21, [x24], #0x10\n" + "sadalp v5.8h, v26.16b\n" "zip1 v23.2d, v25.2d, v21.2d\n" - "prfm pldl1keep, [x24, #0x70]\n" "ldr q22, [x23], #0x10\n" + "cmp %x[width], #0x10\n" "zip2 v21.2d, v25.2d, v21.2d\n" - "prfm pldl1keep, [x23, #0x70]\n" "ldr q18, [x22], #0x10\n" + "sadalp v4.8h, v23.16b\n" "zip1 v20.2d, v22.2d, v18.2d\n" - "prfm pldl1keep, [x22, #0x70]\n" "ldr q19, [x21], #0x10\n" + "sadalp v5.8h, v24.16b\n" "zip2 v18.2d, v22.2d, v18.2d\n" - "prfm pldl1keep, [x21, #0x70]\n" "ldr q16, [x20], #0x10\n" + "sadalp v3.8h, v20.16b\n" "zip1 v17.2d, v19.2d, v16.2d\n" + "prfm pldl1keep, [x27, #0x70]\n" + "sadalp v4.8h, v21.16b\n" + "zip2 v16.2d, v19.2d, v16.2d\n" + "prfm pldl1keep, [x26, #0x70]\n" + "sadalp v2.8h, v17.16b\n" + "prfm pldl1keep, [x25, #0x70]\n" + "sadalp v3.8h, v18.16b\n" + "prfm pldl1keep, [x24, #0x70]\n" + "sadalp v2.8h, v16.16b\n" + "prfm pldl1keep, [x23, #0x70]\n" + "prfm pldl1keep, [x22, #0x70]\n" + "prfm pldl1keep, [x21, #0x70]\n" "prfm pldl1keep, [x20, #0x70]\n" "str q26, [%x[out_ptr], #0x0]\n" - "zip2 v16.2d, v19.2d, v16.2d\n" - "sadalp v5.8h, v26.16b\n" "str q23, [%x[out_ptr], #0x10]\n" - "sadalp v4.8h, v23.16b\n" "str q20, [%x[out_ptr], #0x20]\n" - "sadalp v3.8h, v20.16b\n" "str q17, [%x[out_ptr], #0x30]\n" - "sadalp v2.8h, v17.16b\n" "str q24, [%x[out_ptr], #0x40]\n" - "sadalp v5.8h, v24.16b\n" "str q21, [%x[out_ptr], #0x50]\n" - "sadalp v4.8h, v21.16b\n" "str q18, [%x[out_ptr], #0x60]\n" - "sadalp v3.8h, v18.16b\n" "str q16, [%x[out_ptr], #0x70]\n" - "sadalp v2.8h, v16.16b\n" - "add x19, x19, 
#0x1\n" - "subs %x[width], %x[width], #0x10\n" - "cmp %x[width], #0x10\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" "bge 3b\n" "5:" // Main loop skip @@ -174,6 +174,7 @@ void interleave_block<8, 8, VLType::None, true>( "ld1 { v16.s }[2], [x20], #0x4\n" "tbz %x[width], #1, 6f\n" "ld1 { v27.h }[6], [x27], #0x2\n" + "mov x19, #0x2\n" "ld1 { v24.h }[6], [x26], #0x2\n" "ld1 { v25.h }[6], [x25], #0x2\n" "ld1 { v21.h }[6], [x24], #0x2\n" @@ -181,7 +182,6 @@ void interleave_block<8, 8, VLType::None, true>( "ld1 { v18.h }[6], [x22], #0x2\n" "ld1 { v19.h }[6], [x21], #0x2\n" "ld1 { v16.h }[6], [x20], #0x2\n" - "mov x19, #0x2\n" "tbz %x[width], #0, 13f\n" "ld1 { v27.b }[14], [x27]\n" "ld1 { v24.b }[14], [x26]\n" @@ -208,13 +208,13 @@ void interleave_block<8, 8, VLType::None, true>( "tbz %x[width], #1, 8f\n" "ld1 { v27.h }[4], [x27], #0x2\n" "ld1 { v24.h }[4], [x26], #0x2\n" + "mov x19, #0x2\n" "ld1 { v25.h }[4], [x25], #0x2\n" "ld1 { v21.h }[4], [x24], #0x2\n" "ld1 { v22.h }[4], [x23], #0x2\n" "ld1 { v18.h }[4], [x22], #0x2\n" "ld1 { v19.h }[4], [x21], #0x2\n" "ld1 { v16.h }[4], [x20], #0x2\n" - "mov x19, #0x2\n" "tbz %x[width], #0, 13f\n" "ld1 { v27.b }[10], [x27]\n" "ld1 { v24.b }[10], [x26]\n" @@ -230,13 +230,13 @@ void interleave_block<8, 8, VLType::None, true>( "tbz %x[width], #0, 13f\n" "ld1 { v27.b }[8], [x27]\n" "ld1 { v24.b }[8], [x26]\n" + "mov x19, #0x2\n" "ld1 { v25.b }[8], [x25]\n" "ld1 { v21.b }[8], [x24]\n" "ld1 { v22.b }[8], [x23]\n" "ld1 { v18.b }[8], [x22]\n" "ld1 { v19.b }[8], [x21]\n" "ld1 { v16.b }[8], [x20]\n" - "mov x19, #0x2\n" "b 13f\n" "9:" // odd_loads_4_0 "tbz %x[width], #2, 11f\n" @@ -250,6 +250,7 @@ void interleave_block<8, 8, VLType::None, true>( "ldr s16, [x20], #0x4\n" "tbz %x[width], #1, 10f\n" "ld1 { v27.h }[2], [x27], #0x2\n" + "mov x19, #0x1\n" "ld1 { v24.h }[2], [x26], #0x2\n" "ld1 { v25.h }[2], [x25], #0x2\n" "ld1 { v21.h }[2], [x24], #0x2\n" @@ -257,7 +258,6 @@ void interleave_block<8, 8, VLType::None, true>( "ld1 { v18.h }[2], [x22], 
#0x2\n" "ld1 { v19.h }[2], [x21], #0x2\n" "ld1 { v16.h }[2], [x20], #0x2\n" - "mov x19, #0x1\n" "tbz %x[width], #0, 13f\n" "ld1 { v27.b }[6], [x27]\n" "ld1 { v24.b }[6], [x26]\n" @@ -284,13 +284,13 @@ void interleave_block<8, 8, VLType::None, true>( "tbz %x[width], #1, 12f\n" "ldr h27, [x27], #0x2\n" "ldr h24, [x26], #0x2\n" + "mov x19, #0x1\n" "ldr h25, [x25], #0x2\n" "ldr h21, [x24], #0x2\n" "ldr h22, [x23], #0x2\n" "ldr h18, [x22], #0x2\n" "ldr h19, [x21], #0x2\n" "ldr h16, [x20], #0x2\n" - "mov x19, #0x1\n" "tbz %x[width], #0, 13f\n" "ld1 { v27.b }[2], [x27]\n" "ld1 { v24.b }[2], [x26]\n" @@ -303,6 +303,7 @@ void interleave_block<8, 8, VLType::None, true>( "b 13f\n" "12:" // odd_loads_1_0 "ldr b27, [x27, #0x0]\n" + "mov x19, #0x1\n" "ldr b24, [x26, #0x0]\n" "ldr b25, [x25, #0x0]\n" "ldr b21, [x24, #0x0]\n" @@ -310,31 +311,30 @@ void interleave_block<8, 8, VLType::None, true>( "ldr b18, [x22, #0x0]\n" "ldr b19, [x21, #0x0]\n" "ldr b16, [x20, #0x0]\n" - "mov x19, #0x1\n" "13:" // Odd load end "zip1 v26.2d, v27.2d, v24.2d\n" - "subs x19, x19, #0x1\n" - "zip1 v23.2d, v25.2d, v21.2d\n" "str q26, [%x[out_ptr], #0x0]\n" - "zip1 v20.2d, v22.2d, v18.2d\n" + "zip1 v23.2d, v25.2d, v21.2d\n" "sadalp v5.8h, v26.16b\n" - "zip1 v17.2d, v19.2d, v16.2d\n" + "zip1 v20.2d, v22.2d, v18.2d\n" "str q23, [%x[out_ptr], #0x10]\n" "sadalp v4.8h, v23.16b\n" + "zip1 v17.2d, v19.2d, v16.2d\n" "str q20, [%x[out_ptr], #0x20]\n" "sadalp v3.8h, v20.16b\n" "str q17, [%x[out_ptr], #0x30]\n" "sadalp v2.8h, v17.16b\n" + "subs x19, x19, #0x1\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "beq 14f\n" "zip2 v24.2d, v27.2d, v24.2d\n" - "zip2 v21.2d, v25.2d, v21.2d\n" "str q24, [%x[out_ptr], #0x0]\n" - "zip2 v18.2d, v22.2d, v18.2d\n" + "zip2 v21.2d, v25.2d, v21.2d\n" "sadalp v5.8h, v24.16b\n" - "zip2 v16.2d, v19.2d, v16.2d\n" + "zip2 v18.2d, v22.2d, v18.2d\n" "str q21, [%x[out_ptr], #0x10]\n" "sadalp v4.8h, v21.16b\n" + "zip2 v16.2d, v19.2d, v16.2d\n" "str q18, [%x[out_ptr], #0x20]\n" "sadalp v3.8h, 
v18.16b\n" "str q16, [%x[out_ptr], #0x30]\n" @@ -352,7 +352,7 @@ void interleave_block<8, 8, VLType::None, true>( "add v0.4s, v0.4s, v28.4s\n" "str q0, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" - : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp index 07164d6b24..454260ef1a 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -108,48 +108,48 @@ void interleave_block<8, 8, VLType::None, true>( "mov x19, #0x0\n" "4:" // no_accumulate_16 "ldr q27, [x27], #0x10\n" - "prfm pldl1keep, [x27, #0x70]\n" + "add x19, x19, #0x1\n" "ldr q24, [x26], #0x10\n" "zip1 v26.2d, v27.2d, v24.2d\n" - "prfm pldl1keep, [x26, #0x70]\n" "ldr q25, [x25], #0x10\n" + "subs %x[width], %x[width], #0x10\n" "zip2 v24.2d, v27.2d, v24.2d\n" - "prfm pldl1keep, [x25, #0x70]\n" "ldr q21, [x24], #0x10\n" + "uadalp v5.8h, v26.16b\n" "zip1 v23.2d, v25.2d, v21.2d\n" - "prfm pldl1keep, [x24, #0x70]\n" "ldr q22, [x23], #0x10\n" + "cmp %x[width], #0x10\n" "zip2 v21.2d, v25.2d, v21.2d\n" - "prfm pldl1keep, [x23, #0x70]\n" "ldr q18, [x22], #0x10\n" + "uadalp v4.8h, v23.16b\n" "zip1 v20.2d, v22.2d, v18.2d\n" - "prfm pldl1keep, [x22, #0x70]\n" "ldr q19, [x21], #0x10\n" + "uadalp v5.8h, v24.16b\n" "zip2 v18.2d, v22.2d, v18.2d\n" - "prfm pldl1keep, [x21, #0x70]\n" "ldr q16, [x20], #0x10\n" + "uadalp v3.8h, v20.16b\n" "zip1 v17.2d, v19.2d, v16.2d\n" + "prfm pldl1keep, [x27, #0x70]\n" + "uadalp v4.8h, v21.16b\n" + "zip2 v16.2d, v19.2d, v16.2d\n" + "prfm pldl1keep, [x26, #0x70]\n" + "uadalp v2.8h, v17.16b\n" + "prfm pldl1keep, [x25, #0x70]\n" + "uadalp v3.8h, v18.16b\n" + "prfm pldl1keep, [x24, #0x70]\n" + "uadalp v2.8h, v16.16b\n" + "prfm pldl1keep, [x23, #0x70]\n" + "prfm pldl1keep, [x22, #0x70]\n" + "prfm pldl1keep, [x21, #0x70]\n" "prfm pldl1keep, [x20, #0x70]\n" "str q26, [%x[out_ptr], #0x0]\n" - "zip2 v16.2d, v19.2d, v16.2d\n" - "uadalp v5.8h, v26.16b\n" "str q23, [%x[out_ptr], #0x10]\n" - "uadalp v4.8h, v23.16b\n" "str q20, [%x[out_ptr], #0x20]\n" - "uadalp v3.8h, v20.16b\n" "str q17, [%x[out_ptr], #0x30]\n" - "uadalp v2.8h, v17.16b\n" "str q24, [%x[out_ptr], #0x40]\n" - "uadalp v5.8h, v24.16b\n" "str q21, [%x[out_ptr], #0x50]\n" - "uadalp v4.8h, v21.16b\n" "str q18, [%x[out_ptr], #0x60]\n" - "uadalp v3.8h, v18.16b\n" "str q16, [%x[out_ptr], #0x70]\n" - "uadalp v2.8h, v16.16b\n" - "add x19, x19, 
#0x1\n" - "subs %x[width], %x[width], #0x10\n" - "cmp %x[width], #0x10\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" "bge 3b\n" "5:" // Main loop skip @@ -174,6 +174,7 @@ void interleave_block<8, 8, VLType::None, true>( "ld1 { v16.s }[2], [x20], #0x4\n" "tbz %x[width], #1, 6f\n" "ld1 { v27.h }[6], [x27], #0x2\n" + "mov x19, #0x2\n" "ld1 { v24.h }[6], [x26], #0x2\n" "ld1 { v25.h }[6], [x25], #0x2\n" "ld1 { v21.h }[6], [x24], #0x2\n" @@ -181,7 +182,6 @@ void interleave_block<8, 8, VLType::None, true>( "ld1 { v18.h }[6], [x22], #0x2\n" "ld1 { v19.h }[6], [x21], #0x2\n" "ld1 { v16.h }[6], [x20], #0x2\n" - "mov x19, #0x2\n" "tbz %x[width], #0, 13f\n" "ld1 { v27.b }[14], [x27]\n" "ld1 { v24.b }[14], [x26]\n" @@ -208,13 +208,13 @@ void interleave_block<8, 8, VLType::None, true>( "tbz %x[width], #1, 8f\n" "ld1 { v27.h }[4], [x27], #0x2\n" "ld1 { v24.h }[4], [x26], #0x2\n" + "mov x19, #0x2\n" "ld1 { v25.h }[4], [x25], #0x2\n" "ld1 { v21.h }[4], [x24], #0x2\n" "ld1 { v22.h }[4], [x23], #0x2\n" "ld1 { v18.h }[4], [x22], #0x2\n" "ld1 { v19.h }[4], [x21], #0x2\n" "ld1 { v16.h }[4], [x20], #0x2\n" - "mov x19, #0x2\n" "tbz %x[width], #0, 13f\n" "ld1 { v27.b }[10], [x27]\n" "ld1 { v24.b }[10], [x26]\n" @@ -230,13 +230,13 @@ void interleave_block<8, 8, VLType::None, true>( "tbz %x[width], #0, 13f\n" "ld1 { v27.b }[8], [x27]\n" "ld1 { v24.b }[8], [x26]\n" + "mov x19, #0x2\n" "ld1 { v25.b }[8], [x25]\n" "ld1 { v21.b }[8], [x24]\n" "ld1 { v22.b }[8], [x23]\n" "ld1 { v18.b }[8], [x22]\n" "ld1 { v19.b }[8], [x21]\n" "ld1 { v16.b }[8], [x20]\n" - "mov x19, #0x2\n" "b 13f\n" "9:" // odd_loads_4_0 "tbz %x[width], #2, 11f\n" @@ -250,6 +250,7 @@ void interleave_block<8, 8, VLType::None, true>( "ldr s16, [x20], #0x4\n" "tbz %x[width], #1, 10f\n" "ld1 { v27.h }[2], [x27], #0x2\n" + "mov x19, #0x1\n" "ld1 { v24.h }[2], [x26], #0x2\n" "ld1 { v25.h }[2], [x25], #0x2\n" "ld1 { v21.h }[2], [x24], #0x2\n" @@ -257,7 +258,6 @@ void interleave_block<8, 8, VLType::None, true>( "ld1 { v18.h }[2], [x22], 
#0x2\n" "ld1 { v19.h }[2], [x21], #0x2\n" "ld1 { v16.h }[2], [x20], #0x2\n" - "mov x19, #0x1\n" "tbz %x[width], #0, 13f\n" "ld1 { v27.b }[6], [x27]\n" "ld1 { v24.b }[6], [x26]\n" @@ -284,13 +284,13 @@ void interleave_block<8, 8, VLType::None, true>( "tbz %x[width], #1, 12f\n" "ldr h27, [x27], #0x2\n" "ldr h24, [x26], #0x2\n" + "mov x19, #0x1\n" "ldr h25, [x25], #0x2\n" "ldr h21, [x24], #0x2\n" "ldr h22, [x23], #0x2\n" "ldr h18, [x22], #0x2\n" "ldr h19, [x21], #0x2\n" "ldr h16, [x20], #0x2\n" - "mov x19, #0x1\n" "tbz %x[width], #0, 13f\n" "ld1 { v27.b }[2], [x27]\n" "ld1 { v24.b }[2], [x26]\n" @@ -303,6 +303,7 @@ void interleave_block<8, 8, VLType::None, true>( "b 13f\n" "12:" // odd_loads_1_0 "ldr b27, [x27, #0x0]\n" + "mov x19, #0x1\n" "ldr b24, [x26, #0x0]\n" "ldr b25, [x25, #0x0]\n" "ldr b21, [x24, #0x0]\n" @@ -310,31 +311,30 @@ void interleave_block<8, 8, VLType::None, true>( "ldr b18, [x22, #0x0]\n" "ldr b19, [x21, #0x0]\n" "ldr b16, [x20, #0x0]\n" - "mov x19, #0x1\n" "13:" // Odd load end "zip1 v26.2d, v27.2d, v24.2d\n" - "subs x19, x19, #0x1\n" - "zip1 v23.2d, v25.2d, v21.2d\n" "str q26, [%x[out_ptr], #0x0]\n" - "zip1 v20.2d, v22.2d, v18.2d\n" + "zip1 v23.2d, v25.2d, v21.2d\n" "uadalp v5.8h, v26.16b\n" - "zip1 v17.2d, v19.2d, v16.2d\n" + "zip1 v20.2d, v22.2d, v18.2d\n" "str q23, [%x[out_ptr], #0x10]\n" "uadalp v4.8h, v23.16b\n" + "zip1 v17.2d, v19.2d, v16.2d\n" "str q20, [%x[out_ptr], #0x20]\n" "uadalp v3.8h, v20.16b\n" "str q17, [%x[out_ptr], #0x30]\n" "uadalp v2.8h, v17.16b\n" + "subs x19, x19, #0x1\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "beq 14f\n" "zip2 v24.2d, v27.2d, v24.2d\n" - "zip2 v21.2d, v25.2d, v21.2d\n" "str q24, [%x[out_ptr], #0x0]\n" - "zip2 v18.2d, v22.2d, v18.2d\n" + "zip2 v21.2d, v25.2d, v21.2d\n" "uadalp v5.8h, v24.16b\n" - "zip2 v16.2d, v19.2d, v16.2d\n" + "zip2 v18.2d, v22.2d, v18.2d\n" "str q21, [%x[out_ptr], #0x10]\n" "uadalp v4.8h, v21.16b\n" + "zip2 v16.2d, v19.2d, v16.2d\n" "str q18, [%x[out_ptr], #0x20]\n" "uadalp v3.8h, 
v18.16b\n" "str q16, [%x[out_ptr], #0x30]\n" @@ -352,7 +352,7 @@ void interleave_block<8, 8, VLType::None, true>( "add v0.4s, v0.4s, v28.4s\n" "str q0, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" - : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp index 24e258e4b8..cccedc6b9c 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp @@ -62,14 +62,13 @@ public: // Use the standard fixed size transforms. 
StdTransformsFixed transforms = {}; - static PerformanceParameters get_performance_parameters(const CPUInfo *ci) - { + static PerformanceParameters get_performance_parameters(const CPUInfo *ci) { switch (ci->get_cpu_model()) { case CPUModel::A55r1: - return { 7.16, 1.14, 0.67 }; + return { 7.16, 1.14, 0.67 }; default: - return { 12.67, 3.98, 1.16 }; + return { 12.67, 3.98, 1.16 }; } } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp index d390108b11..fca96f6028 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp @@ -37,7 +37,6 @@ namespace arm_gemm { - // Actual kernel implementations void a64_hybrid_bf16fp32_dot_6x16( ARGLIST ); @@ -74,7 +73,6 @@ public: // Default to the generic kernel kern_type kernel=a64_hybrid_bf16fp32_dot_6x16; - cls_a64_hybrid_bf16fp32_dot_6x16(const CPUInfo *) { } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp index 85944e9f6a..afb06dedea 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp @@ -96,238 +96,232 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "1:" // Row loop "cmp %x[M], #0x6\n" - "bge 181f\n" + "bge 176f\n" "cmp %x[M], #0x4\n" - "bgt 145f\n" - "beq 109f\n" + "bgt 141f\n" + "beq 106f\n" "cmp %x[M], #0x2\n" - "bgt 73f\n" - "beq 37f\n" - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "mov x14, %x[bias]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 2f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "b 3f\n" - "2:" // Height 1: setup direct output - "mov x13, 
%x[output_ptr]\n" - "3:" // Height 1: Column loop - "cbz x14, 4f\n" - "ldr q8, [x14, #0x0]\n" - "ldr q9, [x14, #0x10]\n" - "ldr q10, [x14, #0x20]\n" - "ldr q11, [x14, #0x30]\n" - "add x14, x14, #0x40\n" - "b 15f\n" - "4:" // Height 1: no bias - "tbz %x[flags], #0, 14f\n" - "cmp x16, #0x10\n" - "bge 13f\n" - "tbz x16, #3, 8f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v9.4s }, [x13], #0x10\n" - "tbz x16, #2, 6f\n" - "ld1 { v10.4s }, [x13], #0x10\n" - "tbz x16, #1, 5f\n" + "bgt 71f\n" + "beq 36f\n" + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x9, %x[bias]\n" + "mov x28, %x[output_ptr]\n" + "2:" // Height 1: Column loop + "cbz x9, 3f\n" + "ldr q8, [x9, #0x0]\n" + "ldr q9, [x9, #0x10]\n" + "ldr q10, [x9, #0x20]\n" + "ldr q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + "b 14f\n" + "3:" // Height 1: no bias + "tbz %x[flags], #0, 13f\n" + "cmp x11, #0x10\n" + "bge 12f\n" + "tbz x11, #3, 7f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "tbz x11, #2, 5f\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "tbz x11, #1, 4f\n" "mov x19, #0x38\n" - "ldr d11, [x13], #0x8\n" - "tbz x16, #0, 12f\n" - "ld1 { v11.s }[2], [x13]\n" - "b 12f\n" - "5:" // Height 1: Partial accumulate: partial_1_12 + "ldr d11, [x28], #0x8\n" + "tbz x11, #0, 11f\n" + "ld1 { v11.s }[2], [x28]\n" + "b 11f\n" + "4:" // Height 1: Partial accumulate: partial_1_12 "mov x19, #0x30\n" - "tbz x16, #0, 12f\n" - "ldr s11, [x13, #0x0]\n" - "b 12f\n" - "6:" // Height 1: Partial accumulate: partial_2_8 - "tbz x16, #1, 7f\n" - "ldr d10, [x13], #0x8\n" + "tbz x11, #0, 11f\n" + "ldr s11, [x28, #0x0]\n" + "b 11f\n" + "5:" // Height 1: Partial accumulate: partial_2_8 + "tbz x11, #1, 6f\n" + "ldr d10, [x28], #0x8\n" "mov x19, #0x28\n" - "tbz x16, #0, 12f\n" - "ld1 { v10.s }[2], [x13]\n" - "b 12f\n" - "7:" // Height 1: Partial accumulate: partial_1_8 + "tbz x11, #0, 11f\n" + "ld1 { v10.s }[2], [x28]\n" + "b 11f\n" + "6:" // Height 1: Partial accumulate: 
partial_1_8 "mov x19, #0x20\n" - "tbz x16, #0, 12f\n" - "ldr s10, [x13, #0x0]\n" - "b 12f\n" - "8:" // Height 1: Partial accumulate: partial_4_0 - "tbz x16, #2, 10f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "tbz x16, #1, 9f\n" + "tbz x11, #0, 11f\n" + "ldr s10, [x28, #0x0]\n" + "b 11f\n" + "7:" // Height 1: Partial accumulate: partial_4_0 + "tbz x11, #2, 9f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "tbz x11, #1, 8f\n" + "ldr d9, [x28], #0x8\n" "mov x19, #0x18\n" - "ldr d9, [x13], #0x8\n" - "tbz x16, #0, 12f\n" - "ld1 { v9.s }[2], [x13]\n" - "b 12f\n" - "9:" // Height 1: Partial accumulate: partial_1_4 + "tbz x11, #0, 11f\n" + "ld1 { v9.s }[2], [x28]\n" + "b 11f\n" + "8:" // Height 1: Partial accumulate: partial_1_4 "mov x19, #0x10\n" - "tbz x16, #0, 12f\n" - "ldr s9, [x13, #0x0]\n" - "b 12f\n" - "10:" // Height 1: Partial accumulate: partial_2_0 - "tbz x16, #1, 11f\n" - "ldr d8, [x13], #0x8\n" + "tbz x11, #0, 11f\n" + "ldr s9, [x28, #0x0]\n" + "b 11f\n" + "9:" // Height 1: Partial accumulate: partial_2_0 + "tbz x11, #1, 10f\n" + "ldr d8, [x28], #0x8\n" "mov x19, #0x8\n" - "tbz x16, #0, 12f\n" - "ld1 { v8.s }[2], [x13]\n" - "b 12f\n" - "11:" // Height 1: Partial accumulate: partial_1_0 + "tbz x11, #0, 11f\n" + "ld1 { v8.s }[2], [x28]\n" + "b 11f\n" + "10:" // Height 1: Partial accumulate: partial_1_0 + "ldr s8, [x28, #0x0]\n" "mov x19, #0x0\n" - "ldr s8, [x13, #0x0]\n" - "12:" // Height 1: Partial accumulate: Done - "sub x13, x13, x19\n" - "b 15f\n" - "13:" // Height 1: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "b 15f\n" - "14:" // Height 1: no accumulate + "11:" // Height 1: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 14f\n" + "12:" // Height 1: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "b 14f\n" + "13:" // Height 1: no accumulate "movi v8.16b, #0x0\n" "movi v9.16b, #0x0\n" "movi v10.16b, 
#0x0\n" "movi v11.16b, #0x0\n" - "15:" // Height 1: setup done - "mov x12, #0x0\n" - "16:" // Height 1: String loop + "14:" // Height 1: setup done + "mov x27, #0x0\n" + "15:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 17f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 16f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "cbnz x12, 18f\n" + "ldr x25, [x20, #0x0]\n" + "cbnz x27, 17f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #1\n" - "b 18f\n" - "17:" // Height 1: setup direct input - "mov x10, %x[input_ptr]\n" - "18:" // Height 1: input setup done - "cmp x11, #0x8\n" - "blt 21f\n" - "cmp x11, #0x10\n" + "add x25, x25, x19, LSL #1\n" + "b 17f\n" + "16:" // Height 1: setup direct input + "mov x25, %x[input_ptr]\n" + "17:" // Height 1: input setup done + "cmp x26, #0x8\n" "blt 20f\n" - "19:" // Height 1: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "ldr q0, [x25, #0x0]\n" + "ldr q6, [x10, #0x0]\n" + "cmp x26, #0x10\n" + "blt 19f\n" + "18:" // Height 1: Multiply loop: Main loop head ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q7, [x10, #0x10]\n" + "add x25, x25, #0x10\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q6, [x10, #0x20]\n" + "sub x26, x26, #0x8\n" ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - "ldr q6, [x15, #0x40]\n" - "add x10, x10, #0x10\n" + "ldr q7, [x10, #0x30]\n" + "cmp x26, #0x10\n" ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q6, [x10, #0x40]\n" + "ldr q7, [x10, #0x50]\n" ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, 
v0.h[1]\n" - "ldr q6, [x15, #0x60]\n" - "sub x11, x11, #0x8\n" + "ldr q6, [x10, #0x60]\n" ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" - "ldr q7, [x15, #0x70]\n" - "cmp x11, #0x10\n" + "ldr q7, [x10, #0x70]\n" + "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" - "ldr q7, [x15, #0xf0]\n" - "add x15, x15, #0x100\n" + "ldr q7, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" + "ldr q6, [x10, #0x0]\n" ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" - "bge 19b\n" - "20:" // Height 1: Multiply loop: Single iteration only - "sub x11, x11, #0x8\n" - "ldr q0, [x10, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "ldr q0, [x25, #0x0]\n" + "bge 18b\n" + "19:" // Height 1: Multiply loop: Single iteration only ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q7, [x10, #0x10]\n" + "sub x26, x26, #0x8\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q6, [x10, #0x20]\n" + "add x25, x25, #0x10\n" ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - "ldr q6, [x15, #0x40]\n" - "add x10, x10, #0x10\n" + "ldr q7, [x10, #0x30]\n" + "ldr q6, [x10, #0x40]\n" ".inst 
0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" + "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" - "ldr q7, [x15, #0xf0]\n" - "add x15, x15, #0x100\n" + "ldr q7, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" - "21:" // Height 1: Multiply loop: Main loop skip - "cbz x11, 25f\n" - "cmp x11, #0x2\n" - "blt 23f\n" - "22:" // Height 1: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr q6, [x15, #0x0]\n" + "20:" // Height 1: Multiply loop: Main loop skip + "cbz x26, 24f\n" + "cmp x26, #0x2\n" + "blt 22f\n" + "21:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x2\n" + "ldr q6, [x10, #0x0]\n" ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q7, [x10, #0x10]\n" + "cmp x26, #0x2\n" ".inst 
0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q6, [x10, #0x20]\n" + "ldr q7, [x10, #0x30]\n" ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - "sub x11, x11, #0x2\n" - "add x15, x15, #0x40\n" + "add x10, x10, #0x40\n" ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" - "cmp x11, #0x2\n" - "bge 22b\n" - "cbz x11, 25f\n" - "23:" // Height 1: Multiply loop: Skip odd blocks - "ldr h0, [x10, #0x0]\n" - "24:" // Height 1: Multiply loop: Ragged operand read: Done - "ldr q6, [x15, #0x0]\n" + "bge 21b\n" + "cbz x26, 24f\n" + "22:" // Height 1: Multiply loop: Skip odd blocks + "ldr h0, [x25, #0x0]\n" + "23:" // Height 1: Multiply loop: Ragged operand read: Done + "ldr q6, [x10, #0x0]\n" ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q7, [x10, #0x10]\n" + "ldr q6, [x10, #0x20]\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - "add x15, x15, #0x40\n" ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" - "25:" // Height 1: Multiply loop: No odd multiplies + "24:" // Height 1: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 16b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "tbz %x[flags], #1, 26f\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 15b\n" + "prfm pstl1keep, [x28, #0x0]\n" + "tbz %x[flags], #1, 25f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1r { v1.4s }, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -340,178 +334,170 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "fmax v9.4s, v9.4s, v1.4s\n" "fmax v10.4s, v10.4s, v1.4s\n" "fmax v11.4s, v11.4s, v1.4s\n" - "26:" // Height 1: No activation - "cmp x16, #0x10\n" - "bge 35f\n" - "tbz x16, #3, 30f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v9.4s }, [x13], #0x10\n" - "tbz x16, #2, 28f\n" - 
"st1 { v10.4s }, [x13], #0x10\n" - "tbz x16, #1, 27f\n" - "str d11, [x13], #0x8\n" - "tbz x16, #0, 34f\n" - "st1 { v11.s }[2], [x13]\n" - "b 34f\n" - "27:" // Height 1: Partial direct writeback: partial_1_12 - "tbz x16, #0, 34f\n" - "str s11, [x13, #0x0]\n" - "b 34f\n" - "28:" // Height 1: Partial direct writeback: partial_2_8 - "tbz x16, #1, 29f\n" - "str d10, [x13], #0x8\n" - "tbz x16, #0, 34f\n" - "st1 { v10.s }[2], [x13]\n" - "b 34f\n" - "29:" // Height 1: Partial direct writeback: partial_1_8 - "tbz x16, #0, 34f\n" - "str s10, [x13, #0x0]\n" - "b 34f\n" - "30:" // Height 1: Partial direct writeback: partial_4_0 - "tbz x16, #2, 32f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "tbz x16, #1, 31f\n" - "str d9, [x13], #0x8\n" - "tbz x16, #0, 34f\n" - "st1 { v9.s }[2], [x13]\n" - "b 34f\n" - "31:" // Height 1: Partial direct writeback: partial_1_4 - "tbz x16, #0, 34f\n" - "str s9, [x13, #0x0]\n" - "b 34f\n" - "32:" // Height 1: Partial direct writeback: partial_2_0 - "tbz x16, #1, 33f\n" - "str d8, [x13], #0x8\n" - "tbz x16, #0, 34f\n" - "st1 { v8.s }[2], [x13]\n" - "b 34f\n" - "33:" // Height 1: Partial direct writeback: partial_1_0 - "str s8, [x13, #0x0]\n" - "34:" // Height 1: Partial direct writeback: Done - "b 36f\n" - "35:" // Height 1: Full writeback - "str q8, [x13, #0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "add x13, x13, #0x40\n" - "36:" // Height 1: Writeback done - "subs x16, x16, #0x10\n" - "bgt 3b\n" - "b 218f\n" - "37:" // Height 2 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 38f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "add x9, x9, x19, LSL #2\n" - "b 39f\n" - "38:" // Height 2: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "39:" // Height 2: Column 
loop - "cbz x14, 40f\n" - "ldr q8, [x14, #0x0]\n" + "25:" // Height 1: No activation + "cmp x11, #0x10\n" + "bge 34f\n" + "tbz x11, #3, 29f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v9.4s }, [x28], #0x10\n" + "tbz x11, #2, 27f\n" + "st1 { v10.4s }, [x28], #0x10\n" + "tbz x11, #1, 26f\n" + "str d11, [x28], #0x8\n" + "tbz x11, #0, 33f\n" + "st1 { v11.s }[2], [x28]\n" + "b 33f\n" + "26:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x11, #0, 33f\n" + "str s11, [x28, #0x0]\n" + "b 33f\n" + "27:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x11, #1, 28f\n" + "str d10, [x28], #0x8\n" + "tbz x11, #0, 33f\n" + "st1 { v10.s }[2], [x28]\n" + "b 33f\n" + "28:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x11, #0, 33f\n" + "str s10, [x28, #0x0]\n" + "b 33f\n" + "29:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x11, #2, 31f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "tbz x11, #1, 30f\n" + "str d9, [x28], #0x8\n" + "tbz x11, #0, 33f\n" + "st1 { v9.s }[2], [x28]\n" + "b 33f\n" + "30:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x11, #0, 33f\n" + "str s9, [x28, #0x0]\n" + "b 33f\n" + "31:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x11, #1, 32f\n" + "str d8, [x28], #0x8\n" + "tbz x11, #0, 33f\n" + "st1 { v8.s }[2], [x28]\n" + "b 33f\n" + "32:" // Height 1: Partial direct writeback: partial_1_0 + "str s8, [x28, #0x0]\n" + "33:" // Height 1: Partial direct writeback: Done + "b 35f\n" + "34:" // Height 1: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "35:" // Height 1: Writeback done + "subs x11, x11, #0x10\n" + "bgt 2b\n" + "b 212f\n" + "36:" // Height 2 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "37:" // Height 2: Column loop + "cbz x9, 38f\n" + "ldr q8, [x9, #0x0]\n" "mov v12.16b, v8.16b\n" - 
"ldr q9, [x14, #0x10]\n" - "ldr q10, [x14, #0x20]\n" + "ldr q9, [x9, #0x10]\n" + "ldr q10, [x9, #0x20]\n" "mov v13.16b, v9.16b\n" - "ldr q11, [x14, #0x30]\n" + "ldr q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" "mov v14.16b, v10.16b\n" - "add x14, x14, #0x40\n" "mov v15.16b, v11.16b\n" - "b 51f\n" - "40:" // Height 2: no bias - "tbz %x[flags], #0, 50f\n" - "cmp x16, #0x10\n" - "bge 49f\n" - "tbz x16, #3, 44f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v9.4s }, [x13], #0x10\n" - "ld1 { v13.4s }, [x9], #0x10\n" - "tbz x16, #2, 42f\n" - "ld1 { v10.4s }, [x13], #0x10\n" - "ld1 { v14.4s }, [x9], #0x10\n" - "tbz x16, #1, 41f\n" + "b 49f\n" + "38:" // Height 2: no bias + "tbz %x[flags], #0, 48f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + "add x24, x28, x19, LSL #2\n" + "bge 47f\n" + "tbz x11, #3, 42f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x24], #0x10\n" + "tbz x11, #2, 40f\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x24], #0x10\n" + "tbz x11, #1, 39f\n" "mov x19, #0x38\n" - "ldr d11, [x13], #0x8\n" - "ldr d15, [x9], #0x8\n" - "tbz x16, #0, 48f\n" - "ld1 { v11.s }[2], [x13]\n" - "ld1 { v15.s }[2], [x9]\n" - "b 48f\n" - "41:" // Height 2: Partial accumulate: partial_1_12 + "ldr d11, [x28], #0x8\n" + "ldr d15, [x24], #0x8\n" + "tbz x11, #0, 46f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x24]\n" + "b 46f\n" + "39:" // Height 2: Partial accumulate: partial_1_12 "mov x19, #0x30\n" - "tbz x16, #0, 48f\n" - "ldr s11, [x13, #0x0]\n" - "ldr s15, [x9, #0x0]\n" - "b 48f\n" - "42:" // Height 2: Partial accumulate: partial_2_8 - "tbz x16, #1, 43f\n" - "ldr d10, [x13], #0x8\n" - "ldr d14, [x9], #0x8\n" + "tbz x11, #0, 46f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s15, [x24, #0x0]\n" + "b 46f\n" + "40:" // Height 2: Partial accumulate: partial_2_8 + "tbz x11, #1, 41f\n" + "ldr d10, [x28], #0x8\n" + "ldr d14, 
[x24], #0x8\n" "mov x19, #0x28\n" - "tbz x16, #0, 48f\n" - "ld1 { v10.s }[2], [x13]\n" - "ld1 { v14.s }[2], [x9]\n" - "b 48f\n" - "43:" // Height 2: Partial accumulate: partial_1_8 + "tbz x11, #0, 46f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x24]\n" + "b 46f\n" + "41:" // Height 2: Partial accumulate: partial_1_8 "mov x19, #0x20\n" - "tbz x16, #0, 48f\n" - "ldr s10, [x13, #0x0]\n" - "ldr s14, [x9, #0x0]\n" - "b 48f\n" - "44:" // Height 2: Partial accumulate: partial_4_0 - "tbz x16, #2, 46f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "tbz x16, #1, 45f\n" + "tbz x11, #0, 46f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s14, [x24, #0x0]\n" + "b 46f\n" + "42:" // Height 2: Partial accumulate: partial_4_0 + "tbz x11, #2, 44f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "tbz x11, #1, 43f\n" "mov x19, #0x18\n" - "ldr d9, [x13], #0x8\n" - "ldr d13, [x9], #0x8\n" - "tbz x16, #0, 48f\n" - "ld1 { v9.s }[2], [x13]\n" - "ld1 { v13.s }[2], [x9]\n" - "b 48f\n" - "45:" // Height 2: Partial accumulate: partial_1_4 + "ldr d9, [x28], #0x8\n" + "ldr d13, [x24], #0x8\n" + "tbz x11, #0, 46f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x24]\n" + "b 46f\n" + "43:" // Height 2: Partial accumulate: partial_1_4 "mov x19, #0x10\n" - "tbz x16, #0, 48f\n" - "ldr s9, [x13, #0x0]\n" - "ldr s13, [x9, #0x0]\n" - "b 48f\n" - "46:" // Height 2: Partial accumulate: partial_2_0 - "tbz x16, #1, 47f\n" - "ldr d8, [x13], #0x8\n" - "ldr d12, [x9], #0x8\n" + "tbz x11, #0, 46f\n" + "ldr s9, [x28, #0x0]\n" + "ldr s13, [x24, #0x0]\n" + "b 46f\n" + "44:" // Height 2: Partial accumulate: partial_2_0 + "tbz x11, #1, 45f\n" + "ldr d8, [x28], #0x8\n" + "ldr d12, [x24], #0x8\n" "mov x19, #0x8\n" - "tbz x16, #0, 48f\n" - "ld1 { v8.s }[2], [x13]\n" - "ld1 { v12.s }[2], [x9]\n" - "b 48f\n" - "47:" // Height 2: Partial accumulate: partial_1_0 + "tbz x11, #0, 46f\n" + "ld1 { v8.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x24]\n" + "b 46f\n" + "45:" // 
Height 2: Partial accumulate: partial_1_0 + "ldr s8, [x28, #0x0]\n" "mov x19, #0x0\n" - "ldr s8, [x13, #0x0]\n" - "ldr s12, [x9, #0x0]\n" - "48:" // Height 2: Partial accumulate: Done - "sub x13, x13, x19\n" - "sub x9, x9, x19\n" - "b 51f\n" - "49:" // Height 2: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "ldr q12, [x9, #0x0]\n" - "ldr q13, [x9, #0x10]\n" - "ldr q14, [x9, #0x20]\n" - "ldr q15, [x9, #0x30]\n" - "b 51f\n" - "50:" // Height 2: no accumulate + "ldr s12, [x24, #0x0]\n" + "46:" // Height 2: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 49f\n" + "47:" // Height 2: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "ldr q12, [x24, #0x0]\n" + "ldr q13, [x24, #0x10]\n" + "ldr q14, [x24, #0x20]\n" + "ldr q15, [x24, #0x30]\n" + "b 49f\n" + "48:" // Height 2: no accumulate "movi v8.16b, #0x0\n" "movi v9.16b, #0x0\n" "movi v10.16b, #0x0\n" @@ -520,195 +506,197 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "movi v13.16b, #0x0\n" "movi v14.16b, #0x0\n" "movi v15.16b, #0x0\n" - "51:" // Height 2: setup done - "mov x12, #0x0\n" - "52:" // Height 2: String loop + "49:" // Height 2: setup done + "mov x27, #0x0\n" + "50:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 53f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 51f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "cbnz x12, 54f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x27, 52f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #1\n" - "add x28, x28, x19, LSL #1\n" - "b 54f\n" - "53:" // Height 2: setup direct 
input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #1\n" - "54:" // Height 2: input setup done - "cmp x11, #0x8\n" - "blt 57f\n" - "cmp x11, #0x10\n" - "blt 56f\n" - "55:" // Height 2: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "add x25, x25, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "b 52f\n" + "51:" // Height 2: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "52:" // Height 2: input setup done + "cmp x26, #0x8\n" + "blt 55f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x10\n" + "ldr q6, [x10, #0x0]\n" + "blt 54f\n" + "53:" // Height 2: Multiply loop: Main loop head ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" + "add x25, x25, #0x10\n" ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" - "ldr q6, [x15, #0x20]\n" - "add x10, x10, #0x10\n" + "ldr q6, [x10, #0x20]\n" + "add x24, x24, #0x10\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "sub x26, x26, #0x8\n" ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" + "cmp x26, #0x10\n" ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - "sub x11, x11, #0x8\n" + "prfm pldl1keep, [x24, #0x80]\n" ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" - "ldr q6, [x15, #0x40]\n" - "cmp x11, #0x10\n" + "ldr q6, [x10, #0x40]\n" ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, 
v1.h[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" - "ldr q7, [x15, #0xf0]\n" - "add x15, x15, #0x100\n" + "ldr q7, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" + "ldr q6, [x10, #0x0]\n" ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + "ldr q0, [x25, #0x0]\n" ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" - "bge 55b\n" - "56:" // Height 2: Multiply loop: Single iteration only - "sub x11, x11, #0x8\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "bge 53b\n" + "54:" // Height 2: Multiply loop: Single iteration only ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - "ldr q7, 
[x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" + "sub x26, x26, #0x8\n" ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" - "ldr q6, [x15, #0x20]\n" - "add x10, x10, #0x10\n" + "ldr q6, [x10, #0x20]\n" + "add x25, x25, #0x10\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" - "ldr q6, [x15, #0x40]\n" + "ldr q6, [x10, #0x40]\n" ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, 
[x10, #0xc0]\n" ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" - "ldr q7, [x15, #0xf0]\n" - "add x15, x15, #0x100\n" + "ldr q7, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" - "57:" // Height 2: Multiply loop: Main loop skip - "cbz x11, 61f\n" - "cmp x11, #0x2\n" - "blt 59f\n" - "58:" // Height 2: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x28], #0x4\n" - "ldr q6, [x15, #0x0]\n" + "55:" // Height 2: Multiply loop: Main loop skip + "cbz x26, 59f\n" + "cmp x26, #0x2\n" + "blt 57f\n" + "56:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x2\n" + "ldr s1, [x24], #0x4\n" + "cmp x26, #0x2\n" + "ldr q6, [x10, #0x0]\n" ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" - "ldr q6, [x15, #0x20]\n" - "sub x11, x11, #0x2\n" + "ldr q6, [x10, #0x20]\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" - "cmp x11, #0x2\n" ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - "add x15, x15, #0x40\n" ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" - "bge 58b\n" - "cbz x11, 61f\n" - "59:" // 
Height 2: Multiply loop: Skip odd blocks - "ldr h0, [x10, #0x0]\n" - "ldr h1, [x28, #0x0]\n" - "60:" // Height 2: Multiply loop: Ragged operand read: Done - "ldr q6, [x15, #0x0]\n" + "bge 56b\n" + "cbz x26, 59f\n" + "57:" // Height 2: Multiply loop: Skip odd blocks + "ldr h0, [x25, #0x0]\n" + "ldr h1, [x24, #0x0]\n" + "58:" // Height 2: Multiply loop: Ragged operand read: Done + "ldr q6, [x10, #0x0]\n" ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q6, [x10, #0x20]\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - "add x15, x15, #0x40\n" ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" - "61:" // Height 2: Multiply loop: No odd multiplies + "59:" // Height 2: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 52b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "tbz %x[flags], #1, 62f\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 50b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x24, x28, x19, LSL #2\n" + "prfm pstl1keep, [x24, #0x0]\n" + "tbz %x[flags], #1, 60f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1r { v1.4s }, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -729,227 +717,215 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "fmax v14.4s, v14.4s, v1.4s\n" "fmin v15.4s, v15.4s, v0.4s\n" "fmax v15.4s, v15.4s, v1.4s\n" - "62:" // Height 2: No activation - "cmp x16, #0x10\n" - "bge 71f\n" - "tbz x16, #3, 66f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { 
v9.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v13.4s }, [x9], #0x10\n" - "tbz x16, #2, 64f\n" - "st1 { v10.4s }, [x13], #0x10\n" - "st1 { v14.4s }, [x9], #0x10\n" - "tbz x16, #1, 63f\n" - "str d11, [x13], #0x8\n" - "str d15, [x9], #0x8\n" - "tbz x16, #0, 70f\n" - "st1 { v11.s }[2], [x13]\n" - "st1 { v15.s }[2], [x9]\n" - "b 70f\n" - "63:" // Height 2: Partial direct writeback: partial_1_12 - "tbz x16, #0, 70f\n" - "str s11, [x13, #0x0]\n" - "str s15, [x9, #0x0]\n" - "b 70f\n" - "64:" // Height 2: Partial direct writeback: partial_2_8 - "tbz x16, #1, 65f\n" - "str d10, [x13], #0x8\n" - "str d14, [x9], #0x8\n" - "tbz x16, #0, 70f\n" - "st1 { v10.s }[2], [x13]\n" - "st1 { v14.s }[2], [x9]\n" - "b 70f\n" - "65:" // Height 2: Partial direct writeback: partial_1_8 - "tbz x16, #0, 70f\n" - "str s10, [x13, #0x0]\n" - "str s14, [x9, #0x0]\n" - "b 70f\n" - "66:" // Height 2: Partial direct writeback: partial_4_0 - "tbz x16, #2, 68f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "tbz x16, #1, 67f\n" - "str d9, [x13], #0x8\n" - "str d13, [x9], #0x8\n" - "tbz x16, #0, 70f\n" - "st1 { v9.s }[2], [x13]\n" - "st1 { v13.s }[2], [x9]\n" - "b 70f\n" - "67:" // Height 2: Partial direct writeback: partial_1_4 - "tbz x16, #0, 70f\n" - "str s9, [x13, #0x0]\n" - "str s13, [x9, #0x0]\n" - "b 70f\n" - "68:" // Height 2: Partial direct writeback: partial_2_0 - "tbz x16, #1, 69f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "tbz x16, #0, 70f\n" - "st1 { v8.s }[2], [x13]\n" - "st1 { v12.s }[2], [x9]\n" + "60:" // Height 2: No activation + "cmp x11, #0x10\n" + "bge 69f\n" + "tbz x11, #3, 64f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v9.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x24], #0x10\n" + "st1 { v13.4s }, [x24], #0x10\n" + "tbz x11, #2, 62f\n" + "st1 { v10.4s }, [x28], #0x10\n" + "st1 { v14.4s }, [x24], #0x10\n" + "tbz x11, #1, 61f\n" + "str d11, [x28], #0x8\n" + "str d15, [x24], #0x8\n" + "tbz x11, #0, 68f\n" + "st1 { v11.s 
}[2], [x28]\n" + "st1 { v15.s }[2], [x24]\n" + "b 68f\n" + "61:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x11, #0, 68f\n" + "str s11, [x28, #0x0]\n" + "str s15, [x24, #0x0]\n" + "b 68f\n" + "62:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x11, #1, 63f\n" + "str d10, [x28], #0x8\n" + "str d14, [x24], #0x8\n" + "tbz x11, #0, 68f\n" + "st1 { v10.s }[2], [x28]\n" + "st1 { v14.s }[2], [x24]\n" + "b 68f\n" + "63:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x11, #0, 68f\n" + "str s10, [x28, #0x0]\n" + "str s14, [x24, #0x0]\n" + "b 68f\n" + "64:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x11, #2, 66f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x24], #0x10\n" + "tbz x11, #1, 65f\n" + "str d9, [x28], #0x8\n" + "str d13, [x24], #0x8\n" + "tbz x11, #0, 68f\n" + "st1 { v9.s }[2], [x28]\n" + "st1 { v13.s }[2], [x24]\n" + "b 68f\n" + "65:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x11, #0, 68f\n" + "str s9, [x28, #0x0]\n" + "str s13, [x24, #0x0]\n" + "b 68f\n" + "66:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x11, #1, 67f\n" + "str d8, [x28], #0x8\n" + "str d12, [x24], #0x8\n" + "tbz x11, #0, 68f\n" + "st1 { v8.s }[2], [x28]\n" + "st1 { v12.s }[2], [x24]\n" + "b 68f\n" + "67:" // Height 2: Partial direct writeback: partial_1_0 + "str s8, [x28, #0x0]\n" + "str s12, [x24, #0x0]\n" + "68:" // Height 2: Partial direct writeback: Done "b 70f\n" - "69:" // Height 2: Partial direct writeback: partial_1_0 - "str s8, [x13, #0x0]\n" - "str s12, [x9, #0x0]\n" - "70:" // Height 2: Partial direct writeback: Done - "b 72f\n" - "71:" // Height 2: Full writeback - "str q8, [x13, #0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "str q12, [x9, #0x0]\n" - "str q13, [x9, #0x10]\n" - "str q14, [x9, #0x20]\n" - "str q15, [x9, #0x30]\n" - "add x13, x13, #0x40\n" - "add x9, x9, #0x40\n" - "72:" // Height 2: Writeback done - "subs x16, x16, #0x10\n" - 
"bgt 39b\n" - "b 218f\n" - "73:" // Height 3 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 74f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "add x27, x27, x19, LSL #2\n" - "b 75f\n" - "74:" // Height 3: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "75:" // Height 3: Column loop - "cbz x14, 76f\n" - "ldr q8, [x14, #0x0]\n" + "69:" // Height 2: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q12, [x24, #0x0]\n" + "str q13, [x24, #0x10]\n" + "str q14, [x24, #0x20]\n" + "str q15, [x24, #0x30]\n" + "70:" // Height 2: Writeback done + "subs x11, x11, #0x10\n" + "bgt 37b\n" + "b 212f\n" + "71:" // Height 3 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "72:" // Height 3: Column loop + "cbz x9, 73f\n" + "ldr q8, [x9, #0x0]\n" "mov v12.16b, v8.16b\n" - "ldr q9, [x14, #0x10]\n" + "ldr q9, [x9, #0x10]\n" "mov v16.16b, v8.16b\n" - "ldr q10, [x14, #0x20]\n" - "ldr q11, [x14, #0x30]\n" + "ldr q10, [x9, #0x20]\n" + "ldr q11, [x9, #0x30]\n" "mov v13.16b, v9.16b\n" - "add x14, x14, #0x40\n" + "add x9, x9, #0x40\n" "mov v17.16b, v9.16b\n" "mov v14.16b, v10.16b\n" "mov v15.16b, v11.16b\n" "mov v18.16b, v10.16b\n" "mov v19.16b, v11.16b\n" - "b 87f\n" - "76:" // Height 3: no bias - "tbz %x[flags], #0, 86f\n" - "cmp x16, #0x10\n" - "bge 85f\n" - "tbz x16, #3, 80f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "ld1 { v9.4s }, [x13], #0x10\n" - "ld1 { v13.4s }, [x9], 
#0x10\n" - "ld1 { v17.4s }, [x27], #0x10\n" - "tbz x16, #2, 78f\n" - "ld1 { v10.4s }, [x13], #0x10\n" - "ld1 { v14.4s }, [x9], #0x10\n" - "ld1 { v18.4s }, [x27], #0x10\n" - "tbz x16, #1, 77f\n" - "mov x19, #0x38\n" - "ldr d11, [x13], #0x8\n" - "ldr d15, [x9], #0x8\n" - "ldr d19, [x27], #0x8\n" - "tbz x16, #0, 84f\n" - "ld1 { v11.s }[2], [x13]\n" - "ld1 { v15.s }[2], [x9]\n" - "ld1 { v19.s }[2], [x27]\n" "b 84f\n" - "77:" // Height 3: Partial accumulate: partial_1_12 + "73:" // Height 3: no bias + "tbz %x[flags], #0, 83f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "bge 82f\n" + "tbz x11, #3, 77f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v16.4s }, [x23], #0x10\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x24], #0x10\n" + "ld1 { v17.4s }, [x23], #0x10\n" + "tbz x11, #2, 75f\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x24], #0x10\n" + "ld1 { v18.4s }, [x23], #0x10\n" + "tbz x11, #1, 74f\n" + "mov x19, #0x38\n" + "ldr d11, [x28], #0x8\n" + "ldr d15, [x24], #0x8\n" + "ldr d19, [x23], #0x8\n" + "tbz x11, #0, 81f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x24]\n" + "ld1 { v19.s }[2], [x23]\n" + "b 81f\n" + "74:" // Height 3: Partial accumulate: partial_1_12 "mov x19, #0x30\n" - "tbz x16, #0, 84f\n" - "ldr s11, [x13, #0x0]\n" - "ldr s15, [x9, #0x0]\n" - "ldr s19, [x27, #0x0]\n" - "b 84f\n" - "78:" // Height 3: Partial accumulate: partial_2_8 - "tbz x16, #1, 79f\n" - "ldr d10, [x13], #0x8\n" - "ldr d14, [x9], #0x8\n" - "ldr d18, [x27], #0x8\n" + "tbz x11, #0, 81f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s15, [x24, #0x0]\n" + "ldr s19, [x23, #0x0]\n" + "b 81f\n" + "75:" // Height 3: Partial accumulate: partial_2_8 + "tbz x11, #1, 76f\n" + "ldr d10, [x28], #0x8\n" + "ldr d14, [x24], #0x8\n" "mov x19, #0x28\n" - "tbz x16, #0, 84f\n" - "ld1 { v10.s }[2], [x13]\n" - "ld1 { v14.s }[2], [x9]\n" - "ld1 { v18.s 
}[2], [x27]\n" - "b 84f\n" - "79:" // Height 3: Partial accumulate: partial_1_8 + "ldr d18, [x23], #0x8\n" + "tbz x11, #0, 81f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x24]\n" + "ld1 { v18.s }[2], [x23]\n" + "b 81f\n" + "76:" // Height 3: Partial accumulate: partial_1_8 "mov x19, #0x20\n" - "tbz x16, #0, 84f\n" - "ldr s10, [x13, #0x0]\n" - "ldr s14, [x9, #0x0]\n" - "ldr s18, [x27, #0x0]\n" - "b 84f\n" - "80:" // Height 3: Partial accumulate: partial_4_0 - "tbz x16, #2, 82f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "tbz x16, #1, 81f\n" + "tbz x11, #0, 81f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s14, [x24, #0x0]\n" + "ldr s18, [x23, #0x0]\n" + "b 81f\n" + "77:" // Height 3: Partial accumulate: partial_4_0 + "tbz x11, #2, 79f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v16.4s }, [x23], #0x10\n" + "tbz x11, #1, 78f\n" "mov x19, #0x18\n" - "ldr d9, [x13], #0x8\n" - "ldr d13, [x9], #0x8\n" - "ldr d17, [x27], #0x8\n" - "tbz x16, #0, 84f\n" - "ld1 { v9.s }[2], [x13]\n" - "ld1 { v13.s }[2], [x9]\n" - "ld1 { v17.s }[2], [x27]\n" - "b 84f\n" - "81:" // Height 3: Partial accumulate: partial_1_4 + "ldr d9, [x28], #0x8\n" + "ldr d13, [x24], #0x8\n" + "ldr d17, [x23], #0x8\n" + "tbz x11, #0, 81f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x24]\n" + "ld1 { v17.s }[2], [x23]\n" + "b 81f\n" + "78:" // Height 3: Partial accumulate: partial_1_4 "mov x19, #0x10\n" - "tbz x16, #0, 84f\n" - "ldr s9, [x13, #0x0]\n" - "ldr s13, [x9, #0x0]\n" - "ldr s17, [x27, #0x0]\n" - "b 84f\n" - "82:" // Height 3: Partial accumulate: partial_2_0 - "tbz x16, #1, 83f\n" - "ldr d8, [x13], #0x8\n" - "ldr d12, [x9], #0x8\n" - "ldr d16, [x27], #0x8\n" + "tbz x11, #0, 81f\n" + "ldr s9, [x28, #0x0]\n" + "ldr s13, [x24, #0x0]\n" + "ldr s17, [x23, #0x0]\n" + "b 81f\n" + "79:" // Height 3: Partial accumulate: partial_2_0 + "tbz x11, #1, 80f\n" + "ldr d8, [x28], #0x8\n" + "ldr d12, [x24], 
#0x8\n" "mov x19, #0x8\n" - "tbz x16, #0, 84f\n" - "ld1 { v8.s }[2], [x13]\n" - "ld1 { v12.s }[2], [x9]\n" - "ld1 { v16.s }[2], [x27]\n" - "b 84f\n" - "83:" // Height 3: Partial accumulate: partial_1_0 + "ldr d16, [x23], #0x8\n" + "tbz x11, #0, 81f\n" + "ld1 { v8.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x24]\n" + "ld1 { v16.s }[2], [x23]\n" + "b 81f\n" + "80:" // Height 3: Partial accumulate: partial_1_0 + "ldr s8, [x28, #0x0]\n" "mov x19, #0x0\n" - "ldr s8, [x13, #0x0]\n" - "ldr s12, [x9, #0x0]\n" - "ldr s16, [x27, #0x0]\n" - "84:" // Height 3: Partial accumulate: Done - "sub x13, x13, x19\n" - "sub x9, x9, x19\n" - "sub x27, x27, x19\n" - "b 87f\n" - "85:" // Height 3: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "ldr q12, [x9, #0x0]\n" - "ldr q13, [x9, #0x10]\n" - "ldr q14, [x9, #0x20]\n" - "ldr q15, [x9, #0x30]\n" - "ldr q16, [x27, #0x0]\n" - "ldr q17, [x27, #0x10]\n" - "ldr q18, [x27, #0x20]\n" - "ldr q19, [x27, #0x30]\n" - "b 87f\n" - "86:" // Height 3: no accumulate + "ldr s12, [x24, #0x0]\n" + "ldr s16, [x23, #0x0]\n" + "81:" // Height 3: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 84f\n" + "82:" // Height 3: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "ldr q12, [x24, #0x0]\n" + "ldr q13, [x24, #0x10]\n" + "ldr q14, [x24, #0x20]\n" + "ldr q15, [x24, #0x30]\n" + "ldr q16, [x23, #0x0]\n" + "ldr q17, [x23, #0x10]\n" + "ldr q18, [x23, #0x20]\n" + "ldr q19, [x23, #0x30]\n" + "b 84f\n" + "83:" // Height 3: no accumulate "movi v8.16b, #0x0\n" "movi v9.16b, #0x0\n" "movi v10.16b, #0x0\n" @@ -962,247 +938,250 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "movi v17.16b, #0x0\n" "movi v18.16b, #0x0\n" "movi v19.16b, #0x0\n" - "87:" // Height 3: setup done - "mov x12, #0x0\n" - "88:" // Height 3: String loop + "84:" // Height 3: setup done + "mov x27, #0x0\n" + "85:" // Height 3: String loop "ldr x20, 
[%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 89f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 86f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "cbnz x12, 90f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "cbnz x27, 87f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #1\n" - "add x28, x28, x19, LSL #1\n" - "add x26, x26, x19, LSL #1\n" - "b 90f\n" - "89:" // Height 3: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #1\n" - "add x26, x28, x19, LSL #1\n" - "90:" // Height 3: input setup done - "cmp x11, #0x8\n" - "blt 93f\n" - "cmp x11, #0x10\n" - "blt 92f\n" - "91:" // Height 3: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "add x25, x25, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" + "b 87f\n" + "86:" // Height 3: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "87:" // Height 3: input setup done + "cmp x26, #0x8\n" + "blt 90f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x10\n" + "ldr q2, [x23, #0x0]\n" + "ldr q6, [x10, #0x0]\n" + "blt 89f\n" + "88:" // Height 3: Multiply loop: Main loop head ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" + "add x25, x25, #0x10\n" ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" - "ldr q6, [x15, 
#0x20]\n" + "ldr q6, [x10, #0x20]\n" + "add x23, x23, #0x10\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" - "add x28, x28, #0x10\n" - "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sub x26, x26, #0x8\n" ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x23, #0x80]\n" + "cmp x26, #0x10\n" ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - "sub x11, x11, #0x8\n" ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" - "cmp x11, #0x10\n" ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" - "ldr q6, [x15, #0x40]\n" + "ldr q6, [x10, #0x40]\n" ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" - "ldr 
q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" - "ldr q7, [x15, #0xf0]\n" - "add x15, x15, #0x100\n" + "ldr q7, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" + "ldr q6, [x10, #0x0]\n" ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + "ldr q0, [x25, #0x0]\n" ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + "ldr q1, [x24, #0x0]\n" ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" - "bge 91b\n" - "92:" // Height 3: Multiply loop: Single iteration only - "sub x11, x11, #0x8\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "ldr q2, [x23, #0x0]\n" + "bge 88b\n" + "89:" // Height 3: Multiply loop: Single iteration only ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" + "sub x26, x26, #0x8\n" ".inst 
0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "add x25, x25, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" - "ldr q6, [x15, #0x20]\n" + "add x24, x24, #0x10\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" - "add x28, x28, #0x10\n" - "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x10, #0x20]\n" + "add x23, x23, #0x10\n" ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "ldr q7, [x15, #0x30]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "ldr q7, [x10, #0x30]\n" ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" - "ldr q6, [x15, #0x40]\n" + "ldr q6, [x10, #0x40]\n" ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, 
#0x90]\n" ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" - "ldr q7, [x15, #0xf0]\n" - "add x15, x15, #0x100\n" + "ldr q7, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" - "93:" // Height 3: Multiply loop: Main loop skip - "cbz x11, 97f\n" - "cmp x11, #0x2\n" - "blt 95f\n" - "94:" // Height 3: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x28], #0x4\n" - "ldr s2, [x26], #0x4\n" - "ldr q6, [x15, #0x0]\n" + "90:" // Height 3: Multiply loop: Main loop skip + "cbz x26, 94f\n" + "cmp x26, 
#0x2\n" + "blt 92f\n" + "91:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x2\n" + "ldr s1, [x24], #0x4\n" + "cmp x26, #0x2\n" + "ldr s2, [x23], #0x4\n" + "ldr q6, [x10, #0x0]\n" ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" - "sub x11, x11, #0x2\n" ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" - "ldr q6, [x15, #0x20]\n" - "cmp x11, #0x2\n" + "ldr q6, [x10, #0x20]\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - "add x15, x15, #0x40\n" ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" - "bge 94b\n" - "cbz x11, 97f\n" - "95:" // Height 3: Multiply loop: Skip odd blocks - "ldr h0, [x10, #0x0]\n" - "ldr h1, [x28, #0x0]\n" - "ldr h2, [x26, #0x0]\n" - "96:" // Height 3: Multiply loop: Ragged operand read: Done - "ldr q6, [x15, #0x0]\n" + "bge 91b\n" + "cbz x26, 94f\n" + "92:" // Height 3: Multiply loop: Skip odd blocks + "ldr h0, [x25, #0x0]\n" + "ldr h1, [x24, #0x0]\n" + "ldr h2, [x23, #0x0]\n" + "93:" // Height 3: Multiply loop: Ragged operand read: Done + "ldr q6, [x10, #0x0]\n" ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q6, [x10, #0x20]\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" ".inst 
0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - "add x15, x15, #0x40\n" ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" - "97:" // Height 3: Multiply loop: No odd multiplies + "94:" // Height 3: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 88b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "prfm pstl1keep, [x27, #0x0]\n" - "tbz %x[flags], #1, 98f\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 85b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x24, x28, x19, LSL #2\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbz %x[flags], #1, 95f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1r { v1.4s }, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -1231,137 +1210,120 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "fmin v19.4s, v19.4s, v0.4s\n" "fmax v18.4s, v18.4s, v1.4s\n" "fmax v19.4s, v19.4s, v1.4s\n" - "98:" // Height 3: No activation - "cmp x16, #0x10\n" - "bge 107f\n" - "tbz x16, #3, 102f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v9.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v13.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "st1 { v17.4s }, [x27], #0x10\n" - "tbz x16, #2, 100f\n" - "st1 { v10.4s }, [x13], #0x10\n" - "st1 { v14.4s }, [x9], #0x10\n" - "st1 { v18.4s }, [x27], #0x10\n" - "tbz x16, #1, 99f\n" - "str d11, [x13], #0x8\n" - "str d15, [x9], #0x8\n" - "str d19, [x27], #0x8\n" - "tbz x16, #0, 106f\n" - "st1 { v11.s }[2], [x13]\n" - "st1 { v15.s }[2], 
[x9]\n" - "st1 { v19.s }[2], [x27]\n" - "b 106f\n" - "99:" // Height 3: Partial direct writeback: partial_1_12 - "tbz x16, #0, 106f\n" - "str s11, [x13, #0x0]\n" - "str s15, [x9, #0x0]\n" - "str s19, [x27, #0x0]\n" - "b 106f\n" - "100:" // Height 3: Partial direct writeback: partial_2_8 - "tbz x16, #1, 101f\n" - "str d10, [x13], #0x8\n" - "str d14, [x9], #0x8\n" - "str d18, [x27], #0x8\n" - "tbz x16, #0, 106f\n" - "st1 { v10.s }[2], [x13]\n" - "st1 { v14.s }[2], [x9]\n" - "st1 { v18.s }[2], [x27]\n" - "b 106f\n" - "101:" // Height 3: Partial direct writeback: partial_1_8 - "tbz x16, #0, 106f\n" - "str s10, [x13, #0x0]\n" - "str s14, [x9, #0x0]\n" - "str s18, [x27, #0x0]\n" - "b 106f\n" - "102:" // Height 3: Partial direct writeback: partial_4_0 - "tbz x16, #2, 104f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "tbz x16, #1, 103f\n" - "str d9, [x13], #0x8\n" - "str d13, [x9], #0x8\n" - "str d17, [x27], #0x8\n" - "tbz x16, #0, 106f\n" - "st1 { v9.s }[2], [x13]\n" - "st1 { v13.s }[2], [x9]\n" - "st1 { v17.s }[2], [x27]\n" - "b 106f\n" - "103:" // Height 3: Partial direct writeback: partial_1_4 - "tbz x16, #0, 106f\n" - "str s9, [x13, #0x0]\n" - "str s13, [x9, #0x0]\n" - "str s17, [x27, #0x0]\n" - "b 106f\n" - "104:" // Height 3: Partial direct writeback: partial_2_0 - "tbz x16, #1, 105f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "str d16, [x27], #0x8\n" - "tbz x16, #0, 106f\n" - "st1 { v8.s }[2], [x13]\n" - "st1 { v12.s }[2], [x9]\n" - "st1 { v16.s }[2], [x27]\n" - "b 106f\n" - "105:" // Height 3: Partial direct writeback: partial_1_0 - "str s8, [x13, #0x0]\n" - "str s12, [x9, #0x0]\n" - "str s16, [x27, #0x0]\n" - "106:" // Height 3: Partial direct writeback: Done - "b 108f\n" - "107:" // Height 3: Full writeback - "str q8, [x13, #0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "str q12, [x9, #0x0]\n" - "str q13, [x9, #0x10]\n" - "str q14, [x9, 
#0x20]\n" - "str q15, [x9, #0x30]\n" - "str q16, [x27, #0x0]\n" - "str q17, [x27, #0x10]\n" - "str q18, [x27, #0x20]\n" - "str q19, [x27, #0x30]\n" - "add x13, x13, #0x40\n" - "add x9, x9, #0x40\n" - "add x27, x27, #0x40\n" - "108:" // Height 3: Writeback done - "subs x16, x16, #0x10\n" - "bgt 75b\n" - "b 218f\n" - "109:" // Height 4 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 110f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "add x27, x27, x19, LSL #2\n" - "add x25, x25, x19, LSL #2\n" - "b 111f\n" - "110:" // Height 4: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "111:" // Height 4: Column loop - "cbz x14, 112f\n" - "ldr q8, [x14, #0x0]\n" + "95:" // Height 3: No activation + "cmp x11, #0x10\n" + "bge 104f\n" + "tbz x11, #3, 99f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v9.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x24], #0x10\n" + "st1 { v13.4s }, [x24], #0x10\n" + "st1 { v16.4s }, [x23], #0x10\n" + "st1 { v17.4s }, [x23], #0x10\n" + "tbz x11, #2, 97f\n" + "st1 { v10.4s }, [x28], #0x10\n" + "st1 { v14.4s }, [x24], #0x10\n" + "st1 { v18.4s }, [x23], #0x10\n" + "tbz x11, #1, 96f\n" + "str d11, [x28], #0x8\n" + "str d15, [x24], #0x8\n" + "str d19, [x23], #0x8\n" + "tbz x11, #0, 103f\n" + "st1 { v11.s }[2], [x28]\n" + "st1 { v15.s }[2], [x24]\n" + "st1 { v19.s }[2], [x23]\n" + "b 103f\n" + "96:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x11, #0, 103f\n" + "str s11, [x28, #0x0]\n" + "str s15, [x24, #0x0]\n" + "str s19, [x23, #0x0]\n" + "b 103f\n" + "97:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x11, #1, 98f\n" 
+ "str d10, [x28], #0x8\n" + "str d14, [x24], #0x8\n" + "str d18, [x23], #0x8\n" + "tbz x11, #0, 103f\n" + "st1 { v10.s }[2], [x28]\n" + "st1 { v14.s }[2], [x24]\n" + "st1 { v18.s }[2], [x23]\n" + "b 103f\n" + "98:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x11, #0, 103f\n" + "str s10, [x28, #0x0]\n" + "str s14, [x24, #0x0]\n" + "str s18, [x23, #0x0]\n" + "b 103f\n" + "99:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x11, #2, 101f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x24], #0x10\n" + "st1 { v16.4s }, [x23], #0x10\n" + "tbz x11, #1, 100f\n" + "str d9, [x28], #0x8\n" + "str d13, [x24], #0x8\n" + "str d17, [x23], #0x8\n" + "tbz x11, #0, 103f\n" + "st1 { v9.s }[2], [x28]\n" + "st1 { v13.s }[2], [x24]\n" + "st1 { v17.s }[2], [x23]\n" + "b 103f\n" + "100:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x11, #0, 103f\n" + "str s9, [x28, #0x0]\n" + "str s13, [x24, #0x0]\n" + "str s17, [x23, #0x0]\n" + "b 103f\n" + "101:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x11, #1, 102f\n" + "str d8, [x28], #0x8\n" + "str d12, [x24], #0x8\n" + "str d16, [x23], #0x8\n" + "tbz x11, #0, 103f\n" + "st1 { v8.s }[2], [x28]\n" + "st1 { v12.s }[2], [x24]\n" + "st1 { v16.s }[2], [x23]\n" + "b 103f\n" + "102:" // Height 3: Partial direct writeback: partial_1_0 + "str s8, [x28, #0x0]\n" + "str s12, [x24, #0x0]\n" + "str s16, [x23, #0x0]\n" + "103:" // Height 3: Partial direct writeback: Done + "b 105f\n" + "104:" // Height 3: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q12, [x24, #0x0]\n" + "str q13, [x24, #0x10]\n" + "str q14, [x24, #0x20]\n" + "str q15, [x24, #0x30]\n" + "str q16, [x23, #0x0]\n" + "str q17, [x23, #0x10]\n" + "str q18, [x23, #0x20]\n" + "str q19, [x23, #0x30]\n" + "105:" // Height 3: Writeback done + "subs x11, x11, #0x10\n" + "bgt 72b\n" + "b 212f\n" + "106:" // Height 4 + "ldr x11, 
[%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "107:" // Height 4: Column loop + "cbz x9, 108f\n" + "ldr q8, [x9, #0x0]\n" "mov v12.16b, v8.16b\n" - "ldr q9, [x14, #0x10]\n" + "ldr q9, [x9, #0x10]\n" "mov v16.16b, v8.16b\n" - "ldr q10, [x14, #0x20]\n" + "ldr q10, [x9, #0x20]\n" "mov v20.16b, v8.16b\n" - "ldr q11, [x14, #0x30]\n" - "add x14, x14, #0x40\n" + "ldr q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" "mov v13.16b, v9.16b\n" "mov v17.16b, v9.16b\n" "mov v14.16b, v10.16b\n" @@ -1371,136 +1333,137 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "mov v21.16b, v9.16b\n" "mov v22.16b, v10.16b\n" "mov v23.16b, v11.16b\n" - "b 123f\n" - "112:" // Height 4: no bias - "tbz %x[flags], #0, 122f\n" - "cmp x16, #0x10\n" - "bge 121f\n" - "tbz x16, #3, 116f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "ld1 { v20.4s }, [x25], #0x10\n" - "ld1 { v9.4s }, [x13], #0x10\n" - "ld1 { v13.4s }, [x9], #0x10\n" - "ld1 { v17.4s }, [x27], #0x10\n" - "ld1 { v21.4s }, [x25], #0x10\n" - "tbz x16, #2, 114f\n" - "ld1 { v10.4s }, [x13], #0x10\n" - "ld1 { v14.4s }, [x9], #0x10\n" - "ld1 { v18.4s }, [x27], #0x10\n" - "ld1 { v22.4s }, [x25], #0x10\n" - "tbz x16, #1, 113f\n" + "b 119f\n" + "108:" // Height 4: no bias + "tbz %x[flags], #0, 118f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "bge 117f\n" + "tbz x11, #3, 112f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v16.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x24], #0x10\n" + "ld1 { v17.4s }, [x23], #0x10\n" + "ld1 { v21.4s }, [x22], #0x10\n" + "tbz x11, #2, 110f\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x24], #0x10\n" + "ld1 { v18.4s }, [x23], #0x10\n" + 
"ld1 { v22.4s }, [x22], #0x10\n" + "tbz x11, #1, 109f\n" "mov x19, #0x38\n" - "ldr d11, [x13], #0x8\n" - "ldr d15, [x9], #0x8\n" - "ldr d19, [x27], #0x8\n" - "ldr d23, [x25], #0x8\n" - "tbz x16, #0, 120f\n" - "ld1 { v11.s }[2], [x13]\n" - "ld1 { v15.s }[2], [x9]\n" - "ld1 { v19.s }[2], [x27]\n" - "ld1 { v23.s }[2], [x25]\n" - "b 120f\n" - "113:" // Height 4: Partial accumulate: partial_1_12 + "ldr d11, [x28], #0x8\n" + "ldr d15, [x24], #0x8\n" + "ldr d19, [x23], #0x8\n" + "ldr d23, [x22], #0x8\n" + "tbz x11, #0, 116f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x24]\n" + "ld1 { v19.s }[2], [x23]\n" + "ld1 { v23.s }[2], [x22]\n" + "b 116f\n" + "109:" // Height 4: Partial accumulate: partial_1_12 "mov x19, #0x30\n" - "tbz x16, #0, 120f\n" - "ldr s11, [x13, #0x0]\n" - "ldr s15, [x9, #0x0]\n" - "ldr s19, [x27, #0x0]\n" - "ldr s23, [x25, #0x0]\n" - "b 120f\n" - "114:" // Height 4: Partial accumulate: partial_2_8 - "tbz x16, #1, 115f\n" - "ldr d10, [x13], #0x8\n" - "ldr d14, [x9], #0x8\n" - "ldr d18, [x27], #0x8\n" - "ldr d22, [x25], #0x8\n" + "tbz x11, #0, 116f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s15, [x24, #0x0]\n" + "ldr s19, [x23, #0x0]\n" + "ldr s23, [x22, #0x0]\n" + "b 116f\n" + "110:" // Height 4: Partial accumulate: partial_2_8 + "tbz x11, #1, 111f\n" + "ldr d10, [x28], #0x8\n" + "ldr d14, [x24], #0x8\n" "mov x19, #0x28\n" - "tbz x16, #0, 120f\n" - "ld1 { v10.s }[2], [x13]\n" - "ld1 { v14.s }[2], [x9]\n" - "ld1 { v18.s }[2], [x27]\n" - "ld1 { v22.s }[2], [x25]\n" - "b 120f\n" - "115:" // Height 4: Partial accumulate: partial_1_8 + "ldr d18, [x23], #0x8\n" + "ldr d22, [x22], #0x8\n" + "tbz x11, #0, 116f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x24]\n" + "ld1 { v18.s }[2], [x23]\n" + "ld1 { v22.s }[2], [x22]\n" + "b 116f\n" + "111:" // Height 4: Partial accumulate: partial_1_8 "mov x19, #0x20\n" - "tbz x16, #0, 120f\n" - "ldr s10, [x13, #0x0]\n" - "ldr s14, [x9, #0x0]\n" - "ldr s18, [x27, #0x0]\n" - "ldr s22, [x25, #0x0]\n" - "b 120f\n" - 
"116:" // Height 4: Partial accumulate: partial_4_0 - "tbz x16, #2, 118f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "ld1 { v20.4s }, [x25], #0x10\n" - "tbz x16, #1, 117f\n" + "tbz x11, #0, 116f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s14, [x24, #0x0]\n" + "ldr s18, [x23, #0x0]\n" + "ldr s22, [x22, #0x0]\n" + "b 116f\n" + "112:" // Height 4: Partial accumulate: partial_4_0 + "tbz x11, #2, 114f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v16.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "tbz x11, #1, 113f\n" "mov x19, #0x18\n" - "ldr d9, [x13], #0x8\n" - "ldr d13, [x9], #0x8\n" - "ldr d17, [x27], #0x8\n" - "ldr d21, [x25], #0x8\n" - "tbz x16, #0, 120f\n" - "ld1 { v9.s }[2], [x13]\n" - "ld1 { v13.s }[2], [x9]\n" - "ld1 { v17.s }[2], [x27]\n" - "ld1 { v21.s }[2], [x25]\n" - "b 120f\n" - "117:" // Height 4: Partial accumulate: partial_1_4 + "ldr d9, [x28], #0x8\n" + "ldr d13, [x24], #0x8\n" + "ldr d17, [x23], #0x8\n" + "ldr d21, [x22], #0x8\n" + "tbz x11, #0, 116f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x24]\n" + "ld1 { v17.s }[2], [x23]\n" + "ld1 { v21.s }[2], [x22]\n" + "b 116f\n" + "113:" // Height 4: Partial accumulate: partial_1_4 "mov x19, #0x10\n" - "tbz x16, #0, 120f\n" - "ldr s9, [x13, #0x0]\n" - "ldr s13, [x9, #0x0]\n" - "ldr s17, [x27, #0x0]\n" - "ldr s21, [x25, #0x0]\n" - "b 120f\n" - "118:" // Height 4: Partial accumulate: partial_2_0 - "tbz x16, #1, 119f\n" - "ldr d8, [x13], #0x8\n" - "ldr d12, [x9], #0x8\n" - "ldr d16, [x27], #0x8\n" - "ldr d20, [x25], #0x8\n" + "tbz x11, #0, 116f\n" + "ldr s9, [x28, #0x0]\n" + "ldr s13, [x24, #0x0]\n" + "ldr s17, [x23, #0x0]\n" + "ldr s21, [x22, #0x0]\n" + "b 116f\n" + "114:" // Height 4: Partial accumulate: partial_2_0 + "tbz x11, #1, 115f\n" + "ldr d8, [x28], #0x8\n" + "ldr d12, [x24], #0x8\n" "mov x19, #0x8\n" - "tbz x16, #0, 120f\n" - "ld1 { v8.s }[2], [x13]\n" - "ld1 { v12.s }[2], [x9]\n" - 
"ld1 { v16.s }[2], [x27]\n" - "ld1 { v20.s }[2], [x25]\n" - "b 120f\n" - "119:" // Height 4: Partial accumulate: partial_1_0 + "ldr d16, [x23], #0x8\n" + "ldr d20, [x22], #0x8\n" + "tbz x11, #0, 116f\n" + "ld1 { v8.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x24]\n" + "ld1 { v16.s }[2], [x23]\n" + "ld1 { v20.s }[2], [x22]\n" + "b 116f\n" + "115:" // Height 4: Partial accumulate: partial_1_0 + "ldr s8, [x28, #0x0]\n" "mov x19, #0x0\n" - "ldr s8, [x13, #0x0]\n" - "ldr s12, [x9, #0x0]\n" - "ldr s16, [x27, #0x0]\n" - "ldr s20, [x25, #0x0]\n" - "120:" // Height 4: Partial accumulate: Done - "sub x13, x13, x19\n" - "sub x9, x9, x19\n" - "sub x27, x27, x19\n" - "sub x25, x25, x19\n" - "b 123f\n" - "121:" // Height 4: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "ldr q12, [x9, #0x0]\n" - "ldr q13, [x9, #0x10]\n" - "ldr q14, [x9, #0x20]\n" - "ldr q15, [x9, #0x30]\n" - "ldr q16, [x27, #0x0]\n" - "ldr q17, [x27, #0x10]\n" - "ldr q18, [x27, #0x20]\n" - "ldr q19, [x27, #0x30]\n" - "ldr q20, [x25, #0x0]\n" - "ldr q21, [x25, #0x10]\n" - "ldr q22, [x25, #0x20]\n" - "ldr q23, [x25, #0x30]\n" - "b 123f\n" - "122:" // Height 4: no accumulate + "ldr s12, [x24, #0x0]\n" + "ldr s16, [x23, #0x0]\n" + "ldr s20, [x22, #0x0]\n" + "116:" // Height 4: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 119f\n" + "117:" // Height 4: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "ldr q12, [x24, #0x0]\n" + "ldr q13, [x24, #0x10]\n" + "ldr q14, [x24, #0x20]\n" + "ldr q15, [x24, #0x30]\n" + "ldr q16, [x23, #0x0]\n" + "ldr q17, [x23, #0x10]\n" + "ldr q18, [x23, #0x20]\n" + "ldr q19, [x23, #0x30]\n" + "ldr q20, [x22, #0x0]\n" + "ldr q21, [x22, #0x10]\n" + "ldr q22, [x22, #0x20]\n" + "ldr q23, [x22, #0x30]\n" + "b 119f\n" + "118:" // Height 4: no accumulate "movi v8.16b, #0x0\n" "movi v9.16b, #0x0\n" "movi v10.16b, #0x0\n" @@ -1517,220 
+1480,220 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "movi v21.16b, #0x0\n" "movi v22.16b, #0x0\n" "movi v23.16b, #0x0\n" - "123:" // Height 4: setup done - "mov x12, #0x0\n" - "124:" // Height 4: String loop + "119:" // Height 4: setup done + "mov x27, #0x0\n" + "120:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 125f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 121f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "cbnz x12, 126f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "cbnz x27, 122f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #1\n" - "add x28, x28, x19, LSL #1\n" - "add x26, x26, x19, LSL #1\n" + "add x25, x25, x19, LSL #1\n" "add x24, x24, x19, LSL #1\n" - "b 126f\n" - "125:" // Height 4: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #1\n" - "add x26, x28, x19, LSL #1\n" - "add x24, x26, x19, LSL #1\n" - "126:" // Height 4: input setup done - "cmp x11, #0x8\n" - "blt 129f\n" - "cmp x11, #0x10\n" - "blt 128f\n" - "127:" // Height 4: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "add x23, x23, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "b 122f\n" + "121:" // Height 4: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "122:" // Height 4: input setup done + "cmp x26, #0x8\n" + "blt 125f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x10\n" + 
"ldr q2, [x23, #0x0]\n" + "ldr q3, [x22, #0x0]\n" + "ldr q6, [x10, #0x0]\n" + "blt 124f\n" + "123:" // Height 4: Multiply loop: Main loop head ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" + "add x25, x25, #0x10\n" ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x23, x23, #0x10\n" ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q6, [x10, #0x20]\n" + "add x22, x22, #0x10\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" - "add x26, x26, #0x10\n" - "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "sub x26, x26, #0x8\n" ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" - "add x24, x24, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "cmp x26, #0x10\n" ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "sub x11, x11, #0x8\n" ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - "cmp x11, #0x10\n" ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" - "ldr q6, [x15, #0x40]\n" + "ldr q6, [x10, #0x40]\n" ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" 
".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" ".inst 0x4f60f8c8 // bfdot v8.4s, 
v6.8h, v0.h[3]\n" ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n" - "ldr q7, [x15, #0xf0]\n" - "add x15, x15, #0x100\n" + "ldr q7, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n" + "ldr q6, [x10, #0x0]\n" ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + "ldr q0, [x25, #0x0]\n" ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + "ldr q1, [x24, #0x0]\n" ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" + "ldr q2, [x23, #0x0]\n" ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n" - "bge 127b\n" - "128:" // Height 4: Multiply loop: Single iteration only - "sub x11, x11, #0x8\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "ldr q3, [x22, #0x0]\n" + "bge 123b\n" + "124:" // Height 4: Multiply loop: Single iteration only ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" + "sub x26, x26, #0x8\n" ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "add x25, x25, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" - "add x28, x28, #0x10\n" + "add x24, x24, #0x10\n" ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "ldr q6, [x15, #0x20]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x23, x23, #0x10\n" 
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" - "add x26, x26, #0x10\n" - "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x10, #0x20]\n" + "add x22, x22, #0x10\n" ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" - "add x24, x24, #0x10\n" + "prfm pldl1keep, [x23, #0x80]\n" ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" - "ldr q6, [x15, #0x40]\n" + "ldr q6, [x10, #0x40]\n" ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" 
".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n" - "ldr q7, [x15, #0xf0]\n" - "add x15, x15, #0x100\n" + "ldr q7, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" @@ -1739,31 +1702,31 @@ void a64_hybrid_bf16fp32_dot_6x16 ( ".inst 0x4f61f8ef // bfdot 
v15.4s, v7.8h, v1.h[3]\n" ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n" - "129:" // Height 4: Multiply loop: Main loop skip - "cbz x11, 133f\n" - "cmp x11, #0x2\n" - "blt 131f\n" - "130:" // Height 4: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x28], #0x4\n" - "ldr s2, [x26], #0x4\n" - "ldr s3, [x24], #0x4\n" - "ldr q6, [x15, #0x0]\n" + "125:" // Height 4: Multiply loop: Main loop skip + "cbz x26, 129f\n" + "cmp x26, #0x2\n" + "blt 127f\n" + "126:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x2\n" + "ldr s1, [x24], #0x4\n" + "cmp x26, #0x2\n" + "ldr s2, [x23], #0x4\n" + "ldr s3, [x22], #0x4\n" + "ldr q6, [x10, #0x0]\n" ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" - "sub x11, x11, #0x2\n" ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" - "cmp x11, #0x2\n" ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q6, [x10, #0x20]\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - "add x15, x15, #0x40\n" ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" @@ -1771,28 +1734,28 @@ void a64_hybrid_bf16fp32_dot_6x16 ( ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" - "bge 130b\n" - "cbz x11, 133f\n" - "131:" // Height 4: Multiply loop: Skip odd blocks - "ldr h0, [x10, #0x0]\n" - "ldr h1, [x28, 
#0x0]\n" - "ldr h2, [x26, #0x0]\n" - "ldr h3, [x24, #0x0]\n" - "132:" // Height 4: Multiply loop: Ragged operand read: Done - "ldr q6, [x15, #0x0]\n" + "bge 126b\n" + "cbz x26, 129f\n" + "127:" // Height 4: Multiply loop: Skip odd blocks + "ldr h0, [x25, #0x0]\n" + "ldr h1, [x24, #0x0]\n" + "ldr h2, [x23, #0x0]\n" + "ldr h3, [x22, #0x0]\n" + "128:" // Height 4: Multiply loop: Ragged operand read: Done + "ldr q6, [x10, #0x0]\n" ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q6, [x10, #0x20]\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - "add x15, x15, #0x40\n" ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" @@ -1800,16 +1763,20 @@ void a64_hybrid_bf16fp32_dot_6x16 ( ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" - "133:" // Height 4: Multiply loop: No odd multiplies + "129:" // Height 4: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 124b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "prfm pstl1keep, [x27, #0x0]\n" - "prfm pstl1keep, [x25, #0x0]\n" - "tbz %x[flags], #1, 134f\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 120b\n" + "ldr x19, [%x[args_ptr], 
%[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x24, x28, x19, LSL #2\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "tbz %x[flags], #1, 130f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1r { v1.4s }, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -1846,162 +1813,141 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "fmax v21.4s, v21.4s, v1.4s\n" "fmax v22.4s, v22.4s, v1.4s\n" "fmax v23.4s, v23.4s, v1.4s\n" - "134:" // Height 4: No activation - "cmp x16, #0x10\n" - "bge 143f\n" - "tbz x16, #3, 138f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v9.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v13.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "st1 { v17.4s }, [x27], #0x10\n" - "st1 { v20.4s }, [x25], #0x10\n" - "st1 { v21.4s }, [x25], #0x10\n" - "tbz x16, #2, 136f\n" - "st1 { v10.4s }, [x13], #0x10\n" - "st1 { v14.4s }, [x9], #0x10\n" - "st1 { v18.4s }, [x27], #0x10\n" - "st1 { v22.4s }, [x25], #0x10\n" - "tbz x16, #1, 135f\n" - "str d11, [x13], #0x8\n" - "str d15, [x9], #0x8\n" - "str d19, [x27], #0x8\n" - "str d23, [x25], #0x8\n" - "tbz x16, #0, 142f\n" - "st1 { v11.s }[2], [x13]\n" - "st1 { v15.s }[2], [x9]\n" - "st1 { v19.s }[2], [x27]\n" - "st1 { v23.s }[2], [x25]\n" - "b 142f\n" - "135:" // Height 4: Partial direct writeback: partial_1_12 - "tbz x16, #0, 142f\n" - "str s11, [x13, #0x0]\n" - "str s15, [x9, #0x0]\n" - "str s19, [x27, #0x0]\n" - "str s23, [x25, #0x0]\n" - "b 142f\n" - "136:" // Height 4: Partial direct writeback: partial_2_8 - "tbz x16, #1, 137f\n" - "str d10, [x13], #0x8\n" - "str d14, [x9], #0x8\n" - "str d18, [x27], #0x8\n" - "str d22, [x25], #0x8\n" - "tbz x16, #0, 142f\n" - "st1 { v10.s }[2], [x13]\n" - "st1 { v14.s }[2], [x9]\n" - "st1 { v18.s }[2], [x27]\n" - "st1 { v22.s }[2], [x25]\n" - "b 142f\n" - "137:" // Height 4: Partial direct writeback: partial_1_8 - "tbz 
x16, #0, 142f\n" - "str s10, [x13, #0x0]\n" - "str s14, [x9, #0x0]\n" - "str s18, [x27, #0x0]\n" - "str s22, [x25, #0x0]\n" - "b 142f\n" - "138:" // Height 4: Partial direct writeback: partial_4_0 - "tbz x16, #2, 140f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "st1 { v20.4s }, [x25], #0x10\n" - "tbz x16, #1, 139f\n" - "str d9, [x13], #0x8\n" - "str d13, [x9], #0x8\n" - "str d17, [x27], #0x8\n" - "str d21, [x25], #0x8\n" - "tbz x16, #0, 142f\n" - "st1 { v9.s }[2], [x13]\n" - "st1 { v13.s }[2], [x9]\n" - "st1 { v17.s }[2], [x27]\n" - "st1 { v21.s }[2], [x25]\n" - "b 142f\n" - "139:" // Height 4: Partial direct writeback: partial_1_4 - "tbz x16, #0, 142f\n" - "str s9, [x13, #0x0]\n" - "str s13, [x9, #0x0]\n" - "str s17, [x27, #0x0]\n" - "str s21, [x25, #0x0]\n" - "b 142f\n" - "140:" // Height 4: Partial direct writeback: partial_2_0 - "tbz x16, #1, 141f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "str d16, [x27], #0x8\n" - "str d20, [x25], #0x8\n" - "tbz x16, #0, 142f\n" - "st1 { v8.s }[2], [x13]\n" - "st1 { v12.s }[2], [x9]\n" - "st1 { v16.s }[2], [x27]\n" - "st1 { v20.s }[2], [x25]\n" - "b 142f\n" - "141:" // Height 4: Partial direct writeback: partial_1_0 - "str s8, [x13, #0x0]\n" - "str s12, [x9, #0x0]\n" - "str s16, [x27, #0x0]\n" - "str s20, [x25, #0x0]\n" - "142:" // Height 4: Partial direct writeback: Done - "b 144f\n" - "143:" // Height 4: Full writeback - "str q8, [x13, #0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "str q12, [x9, #0x0]\n" - "str q13, [x9, #0x10]\n" - "str q14, [x9, #0x20]\n" - "str q15, [x9, #0x30]\n" - "str q16, [x27, #0x0]\n" - "str q17, [x27, #0x10]\n" - "str q18, [x27, #0x20]\n" - "str q19, [x27, #0x30]\n" - "str q20, [x25, #0x0]\n" - "str q21, [x25, #0x10]\n" - "str q22, [x25, #0x20]\n" - "str q23, [x25, #0x30]\n" - "add x13, x13, #0x40\n" - "add x9, x9, #0x40\n" - "add x27, x27, #0x40\n" - "add x25, x25, 
#0x40\n" - "144:" // Height 4: Writeback done - "subs x16, x16, #0x10\n" - "bgt 111b\n" - "b 218f\n" - "145:" // Height 5 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 146f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "ldr x23, [%x[output_ptr], #0x20]\n" - "add x27, x27, x19, LSL #2\n" - "add x25, x25, x19, LSL #2\n" - "add x23, x23, x19, LSL #2\n" - "b 147f\n" - "146:" // Height 5: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "add x23, x25, x19, LSL #2\n" - "147:" // Height 5: Column loop - "cbz x14, 148f\n" - "ldr q8, [x14, #0x0]\n" + "130:" // Height 4: No activation + "cmp x11, #0x10\n" + "bge 139f\n" + "tbz x11, #3, 134f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v9.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x24], #0x10\n" + "st1 { v13.4s }, [x24], #0x10\n" + "st1 { v16.4s }, [x23], #0x10\n" + "st1 { v17.4s }, [x23], #0x10\n" + "st1 { v20.4s }, [x22], #0x10\n" + "st1 { v21.4s }, [x22], #0x10\n" + "tbz x11, #2, 132f\n" + "st1 { v10.4s }, [x28], #0x10\n" + "st1 { v14.4s }, [x24], #0x10\n" + "st1 { v18.4s }, [x23], #0x10\n" + "st1 { v22.4s }, [x22], #0x10\n" + "tbz x11, #1, 131f\n" + "str d11, [x28], #0x8\n" + "str d15, [x24], #0x8\n" + "str d19, [x23], #0x8\n" + "str d23, [x22], #0x8\n" + "tbz x11, #0, 138f\n" + "st1 { v11.s }[2], [x28]\n" + "st1 { v15.s }[2], [x24]\n" + "st1 { v19.s }[2], [x23]\n" + "st1 { v23.s }[2], [x22]\n" + "b 138f\n" + "131:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x11, #0, 138f\n" + "str s11, [x28, #0x0]\n" + "str s15, [x24, #0x0]\n" + "str s19, [x23, #0x0]\n" + "str s23, [x22, #0x0]\n" + "b 
138f\n" + "132:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x11, #1, 133f\n" + "str d10, [x28], #0x8\n" + "str d14, [x24], #0x8\n" + "str d18, [x23], #0x8\n" + "str d22, [x22], #0x8\n" + "tbz x11, #0, 138f\n" + "st1 { v10.s }[2], [x28]\n" + "st1 { v14.s }[2], [x24]\n" + "st1 { v18.s }[2], [x23]\n" + "st1 { v22.s }[2], [x22]\n" + "b 138f\n" + "133:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x11, #0, 138f\n" + "str s10, [x28, #0x0]\n" + "str s14, [x24, #0x0]\n" + "str s18, [x23, #0x0]\n" + "str s22, [x22, #0x0]\n" + "b 138f\n" + "134:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x11, #2, 136f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x24], #0x10\n" + "st1 { v16.4s }, [x23], #0x10\n" + "st1 { v20.4s }, [x22], #0x10\n" + "tbz x11, #1, 135f\n" + "str d9, [x28], #0x8\n" + "str d13, [x24], #0x8\n" + "str d17, [x23], #0x8\n" + "str d21, [x22], #0x8\n" + "tbz x11, #0, 138f\n" + "st1 { v9.s }[2], [x28]\n" + "st1 { v13.s }[2], [x24]\n" + "st1 { v17.s }[2], [x23]\n" + "st1 { v21.s }[2], [x22]\n" + "b 138f\n" + "135:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x11, #0, 138f\n" + "str s9, [x28, #0x0]\n" + "str s13, [x24, #0x0]\n" + "str s17, [x23, #0x0]\n" + "str s21, [x22, #0x0]\n" + "b 138f\n" + "136:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x11, #1, 137f\n" + "str d8, [x28], #0x8\n" + "str d12, [x24], #0x8\n" + "str d16, [x23], #0x8\n" + "str d20, [x22], #0x8\n" + "tbz x11, #0, 138f\n" + "st1 { v8.s }[2], [x28]\n" + "st1 { v12.s }[2], [x24]\n" + "st1 { v16.s }[2], [x23]\n" + "st1 { v20.s }[2], [x22]\n" + "b 138f\n" + "137:" // Height 4: Partial direct writeback: partial_1_0 + "str s8, [x28, #0x0]\n" + "str s12, [x24, #0x0]\n" + "str s16, [x23, #0x0]\n" + "str s20, [x22, #0x0]\n" + "138:" // Height 4: Partial direct writeback: Done + "b 140f\n" + "139:" // Height 4: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, 
#0x30]\n" + "add x28, x28, #0x40\n" + "str q12, [x24, #0x0]\n" + "str q13, [x24, #0x10]\n" + "str q14, [x24, #0x20]\n" + "str q15, [x24, #0x30]\n" + "str q16, [x23, #0x0]\n" + "str q17, [x23, #0x10]\n" + "str q18, [x23, #0x20]\n" + "str q19, [x23, #0x30]\n" + "str q20, [x22, #0x0]\n" + "str q21, [x22, #0x10]\n" + "str q22, [x22, #0x20]\n" + "str q23, [x22, #0x30]\n" + "140:" // Height 4: Writeback done + "subs x11, x11, #0x10\n" + "bgt 107b\n" + "b 212f\n" + "141:" // Height 5 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "142:" // Height 5: Column loop + "cbz x9, 143f\n" + "ldr q8, [x9, #0x0]\n" "mov v12.16b, v8.16b\n" - "ldr q9, [x14, #0x10]\n" + "ldr q9, [x9, #0x10]\n" "mov v16.16b, v8.16b\n" - "ldr q10, [x14, #0x20]\n" + "ldr q10, [x9, #0x20]\n" "mov v20.16b, v8.16b\n" - "ldr q11, [x14, #0x30]\n" + "ldr q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" "mov v24.16b, v8.16b\n" - "add x14, x14, #0x40\n" "mov v13.16b, v9.16b\n" "mov v17.16b, v9.16b\n" "mov v14.16b, v10.16b\n" @@ -2014,157 +1960,158 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "mov v25.16b, v9.16b\n" "mov v26.16b, v10.16b\n" "mov v27.16b, v11.16b\n" - "b 159f\n" - "148:" // Height 5: no bias - "tbz %x[flags], #0, 158f\n" - "cmp x16, #0x10\n" - "bge 157f\n" - "tbz x16, #3, 152f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "ld1 { v20.4s }, [x25], #0x10\n" - "ld1 { v24.4s }, [x23], #0x10\n" - "ld1 { v9.4s }, [x13], #0x10\n" - "ld1 { v13.4s }, [x9], #0x10\n" - "ld1 { v17.4s }, [x27], #0x10\n" - "ld1 { v21.4s }, [x25], #0x10\n" - "ld1 { v25.4s }, [x23], #0x10\n" - "tbz x16, #2, 150f\n" - "ld1 { v10.4s }, [x13], #0x10\n" - "ld1 { v14.4s }, [x9], #0x10\n" - "ld1 { v18.4s }, [x27], #0x10\n" - "ld1 { v22.4s }, [x25], #0x10\n" - "ld1 { v26.4s }, [x23], #0x10\n" - "tbz x16, #1, 149f\n" + "b 154f\n" + "143:" // Height 5: no bias + "tbz %x[flags], #0, 153f\n" 
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "bge 152f\n" + "tbz x11, #3, 147f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v16.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "ld1 { v24.4s }, [x21], #0x10\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x24], #0x10\n" + "ld1 { v17.4s }, [x23], #0x10\n" + "ld1 { v21.4s }, [x22], #0x10\n" + "ld1 { v25.4s }, [x21], #0x10\n" + "tbz x11, #2, 145f\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x24], #0x10\n" + "ld1 { v18.4s }, [x23], #0x10\n" + "ld1 { v22.4s }, [x22], #0x10\n" + "ld1 { v26.4s }, [x21], #0x10\n" + "tbz x11, #1, 144f\n" "mov x19, #0x38\n" - "ldr d11, [x13], #0x8\n" - "ldr d15, [x9], #0x8\n" - "ldr d19, [x27], #0x8\n" - "ldr d23, [x25], #0x8\n" - "ldr d27, [x23], #0x8\n" - "tbz x16, #0, 156f\n" - "ld1 { v11.s }[2], [x13]\n" - "ld1 { v15.s }[2], [x9]\n" - "ld1 { v19.s }[2], [x27]\n" - "ld1 { v23.s }[2], [x25]\n" - "ld1 { v27.s }[2], [x23]\n" - "b 156f\n" - "149:" // Height 5: Partial accumulate: partial_1_12 + "ldr d11, [x28], #0x8\n" + "ldr d15, [x24], #0x8\n" + "ldr d19, [x23], #0x8\n" + "ldr d23, [x22], #0x8\n" + "ldr d27, [x21], #0x8\n" + "tbz x11, #0, 151f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x24]\n" + "ld1 { v19.s }[2], [x23]\n" + "ld1 { v23.s }[2], [x22]\n" + "ld1 { v27.s }[2], [x21]\n" + "b 151f\n" + "144:" // Height 5: Partial accumulate: partial_1_12 "mov x19, #0x30\n" - "tbz x16, #0, 156f\n" - "ldr s11, [x13, #0x0]\n" - "ldr s15, [x9, #0x0]\n" - "ldr s19, [x27, #0x0]\n" - "ldr s23, [x25, #0x0]\n" - "ldr s27, [x23, #0x0]\n" - "b 156f\n" - "150:" // Height 5: Partial accumulate: partial_2_8 - "tbz x16, #1, 151f\n" - "ldr d10, [x13], #0x8\n" - "ldr d14, [x9], #0x8\n" - "ldr d18, [x27], #0x8\n" - "ldr d22, [x25], #0x8\n" - "ldr d26, [x23], #0x8\n" + "tbz 
x11, #0, 151f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s15, [x24, #0x0]\n" + "ldr s19, [x23, #0x0]\n" + "ldr s23, [x22, #0x0]\n" + "ldr s27, [x21, #0x0]\n" + "b 151f\n" + "145:" // Height 5: Partial accumulate: partial_2_8 + "tbz x11, #1, 146f\n" + "ldr d10, [x28], #0x8\n" + "ldr d14, [x24], #0x8\n" "mov x19, #0x28\n" - "tbz x16, #0, 156f\n" - "ld1 { v10.s }[2], [x13]\n" - "ld1 { v14.s }[2], [x9]\n" - "ld1 { v18.s }[2], [x27]\n" - "ld1 { v22.s }[2], [x25]\n" - "ld1 { v26.s }[2], [x23]\n" - "b 156f\n" - "151:" // Height 5: Partial accumulate: partial_1_8 + "ldr d18, [x23], #0x8\n" + "ldr d22, [x22], #0x8\n" + "ldr d26, [x21], #0x8\n" + "tbz x11, #0, 151f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x24]\n" + "ld1 { v18.s }[2], [x23]\n" + "ld1 { v22.s }[2], [x22]\n" + "ld1 { v26.s }[2], [x21]\n" + "b 151f\n" + "146:" // Height 5: Partial accumulate: partial_1_8 "mov x19, #0x20\n" - "tbz x16, #0, 156f\n" - "ldr s10, [x13, #0x0]\n" - "ldr s14, [x9, #0x0]\n" - "ldr s18, [x27, #0x0]\n" - "ldr s22, [x25, #0x0]\n" - "ldr s26, [x23, #0x0]\n" - "b 156f\n" - "152:" // Height 5: Partial accumulate: partial_4_0 - "tbz x16, #2, 154f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "ld1 { v20.4s }, [x25], #0x10\n" - "ld1 { v24.4s }, [x23], #0x10\n" - "tbz x16, #1, 153f\n" + "tbz x11, #0, 151f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s14, [x24, #0x0]\n" + "ldr s18, [x23, #0x0]\n" + "ldr s22, [x22, #0x0]\n" + "ldr s26, [x21, #0x0]\n" + "b 151f\n" + "147:" // Height 5: Partial accumulate: partial_4_0 + "tbz x11, #2, 149f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v16.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "ld1 { v24.4s }, [x21], #0x10\n" + "tbz x11, #1, 148f\n" "mov x19, #0x18\n" - "ldr d9, [x13], #0x8\n" - "ldr d13, [x9], #0x8\n" - "ldr d17, [x27], #0x8\n" - "ldr d21, [x25], #0x8\n" - "ldr d25, [x23], #0x8\n" - "tbz x16, #0, 156f\n" - "ld1 { v9.s }[2], [x13]\n" - 
"ld1 { v13.s }[2], [x9]\n" - "ld1 { v17.s }[2], [x27]\n" - "ld1 { v21.s }[2], [x25]\n" - "ld1 { v25.s }[2], [x23]\n" - "b 156f\n" - "153:" // Height 5: Partial accumulate: partial_1_4 + "ldr d9, [x28], #0x8\n" + "ldr d13, [x24], #0x8\n" + "ldr d17, [x23], #0x8\n" + "ldr d21, [x22], #0x8\n" + "ldr d25, [x21], #0x8\n" + "tbz x11, #0, 151f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x24]\n" + "ld1 { v17.s }[2], [x23]\n" + "ld1 { v21.s }[2], [x22]\n" + "ld1 { v25.s }[2], [x21]\n" + "b 151f\n" + "148:" // Height 5: Partial accumulate: partial_1_4 "mov x19, #0x10\n" - "tbz x16, #0, 156f\n" - "ldr s9, [x13, #0x0]\n" - "ldr s13, [x9, #0x0]\n" - "ldr s17, [x27, #0x0]\n" - "ldr s21, [x25, #0x0]\n" - "ldr s25, [x23, #0x0]\n" - "b 156f\n" - "154:" // Height 5: Partial accumulate: partial_2_0 - "tbz x16, #1, 155f\n" - "ldr d8, [x13], #0x8\n" - "ldr d12, [x9], #0x8\n" - "ldr d16, [x27], #0x8\n" - "ldr d20, [x25], #0x8\n" - "ldr d24, [x23], #0x8\n" + "tbz x11, #0, 151f\n" + "ldr s9, [x28, #0x0]\n" + "ldr s13, [x24, #0x0]\n" + "ldr s17, [x23, #0x0]\n" + "ldr s21, [x22, #0x0]\n" + "ldr s25, [x21, #0x0]\n" + "b 151f\n" + "149:" // Height 5: Partial accumulate: partial_2_0 + "tbz x11, #1, 150f\n" + "ldr d8, [x28], #0x8\n" + "ldr d12, [x24], #0x8\n" "mov x19, #0x8\n" - "tbz x16, #0, 156f\n" - "ld1 { v8.s }[2], [x13]\n" - "ld1 { v12.s }[2], [x9]\n" - "ld1 { v16.s }[2], [x27]\n" - "ld1 { v20.s }[2], [x25]\n" - "ld1 { v24.s }[2], [x23]\n" - "b 156f\n" - "155:" // Height 5: Partial accumulate: partial_1_0 + "ldr d16, [x23], #0x8\n" + "ldr d20, [x22], #0x8\n" + "ldr d24, [x21], #0x8\n" + "tbz x11, #0, 151f\n" + "ld1 { v8.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x24]\n" + "ld1 { v16.s }[2], [x23]\n" + "ld1 { v20.s }[2], [x22]\n" + "ld1 { v24.s }[2], [x21]\n" + "b 151f\n" + "150:" // Height 5: Partial accumulate: partial_1_0 + "ldr s8, [x28, #0x0]\n" "mov x19, #0x0\n" - "ldr s8, [x13, #0x0]\n" - "ldr s12, [x9, #0x0]\n" - "ldr s16, [x27, #0x0]\n" - "ldr s20, [x25, #0x0]\n" - "ldr s24, 
[x23, #0x0]\n" - "156:" // Height 5: Partial accumulate: Done - "sub x13, x13, x19\n" - "sub x9, x9, x19\n" - "sub x27, x27, x19\n" - "sub x25, x25, x19\n" - "sub x23, x23, x19\n" - "b 159f\n" - "157:" // Height 5: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "ldr q12, [x9, #0x0]\n" - "ldr q13, [x9, #0x10]\n" - "ldr q14, [x9, #0x20]\n" - "ldr q15, [x9, #0x30]\n" - "ldr q16, [x27, #0x0]\n" - "ldr q17, [x27, #0x10]\n" - "ldr q18, [x27, #0x20]\n" - "ldr q19, [x27, #0x30]\n" - "ldr q20, [x25, #0x0]\n" - "ldr q21, [x25, #0x10]\n" - "ldr q22, [x25, #0x20]\n" - "ldr q23, [x25, #0x30]\n" - "ldr q24, [x23, #0x0]\n" - "ldr q25, [x23, #0x10]\n" - "ldr q26, [x23, #0x20]\n" - "ldr q27, [x23, #0x30]\n" - "b 159f\n" - "158:" // Height 5: no accumulate + "ldr s12, [x24, #0x0]\n" + "ldr s16, [x23, #0x0]\n" + "ldr s20, [x22, #0x0]\n" + "ldr s24, [x21, #0x0]\n" + "151:" // Height 5: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 154f\n" + "152:" // Height 5: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "ldr q12, [x24, #0x0]\n" + "ldr q13, [x24, #0x10]\n" + "ldr q14, [x24, #0x20]\n" + "ldr q15, [x24, #0x30]\n" + "ldr q16, [x23, #0x0]\n" + "ldr q17, [x23, #0x10]\n" + "ldr q18, [x23, #0x20]\n" + "ldr q19, [x23, #0x30]\n" + "ldr q20, [x22, #0x0]\n" + "ldr q21, [x22, #0x10]\n" + "ldr q22, [x22, #0x20]\n" + "ldr q23, [x22, #0x30]\n" + "ldr q24, [x21, #0x0]\n" + "ldr q25, [x21, #0x10]\n" + "ldr q26, [x21, #0x20]\n" + "ldr q27, [x21, #0x30]\n" + "b 154f\n" + "153:" // Height 5: no accumulate "movi v8.16b, #0x0\n" "movi v9.16b, #0x0\n" "movi v10.16b, #0x0\n" @@ -2185,260 +2132,260 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "movi v25.16b, #0x0\n" "movi v26.16b, #0x0\n" "movi v27.16b, #0x0\n" - "159:" // Height 5: setup done - "mov x12, #0x0\n" - "160:" // Height 5: String loop + "154:" // Height 5: setup done + "mov x27, #0x0\n" 
+ "155:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 161f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 156f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "ldr x22, [x20, #0x20]\n" - "cbnz x12, 162f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "cbnz x27, 157f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #1\n" - "add x28, x28, x19, LSL #1\n" - "add x26, x26, x19, LSL #1\n" + "add x25, x25, x19, LSL #1\n" "add x24, x24, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" "add x22, x22, x19, LSL #1\n" - "b 162f\n" - "161:" // Height 5: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #1\n" - "add x26, x28, x19, LSL #1\n" - "add x24, x26, x19, LSL #1\n" - "add x22, x24, x19, LSL #1\n" - "162:" // Height 5: input setup done - "cmp x11, #0x8\n" - "blt 165f\n" - "cmp x11, #0x10\n" - "blt 164f\n" - "163:" // Height 5: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q4, [x22, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "add x21, x21, x19, LSL #1\n" + "b 157f\n" + "156:" // Height 5: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "add x21, x22, x19, LSL #1\n" + "157:" // Height 5: input setup done + "cmp x26, #0x8\n" + "blt 160f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x10\n" + "ldr q2, [x23, #0x0]\n" + "ldr q3, [x22, #0x0]\n" + "ldr q4, [x21, #0x0]\n" + "ldr 
q6, [x10, #0x0]\n" + "blt 159f\n" + "158:" // Height 5: Multiply loop: Main loop head ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" + "add x25, x25, #0x10\n" ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x23, x23, #0x10\n" ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x23, #0x80]\n" + "add x22, x22, #0x10\n" ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q6, [x10, #0x20]\n" + "add x21, x21, #0x10\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" - "add x24, x24, #0x10\n" - "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sub x26, x26, #0x8\n" ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" - "add x22, x22, #0x10\n" + "prfm pldl1keep, [x21, #0x80]\n" + "cmp x26, #0x10\n" ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "sub x11, x11, #0x8\n" ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" - "cmp x11, #0x10\n" ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n" - "ldr q6, [x15, #0x40]\n" + "ldr q6, [x10, #0x40]\n" ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, 
v3.h[0]\n" ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n" ".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n" ".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n" ".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n" ".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n" ".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n" ".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n" - "ldr q7, [x15, 
#0xb0]\n" + "ldr q7, [x10, #0xb0]\n" ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n" ".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n" ".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n" ".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n" ".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n" - "ldr q7, [x15, #0xf0]\n" + "ldr q7, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" - "add x15, x15, #0x100\n" ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n" ".inst 0x4f64f8da // bfdot v26.4s, v6.8h, v4.h[3]\n" + "ldr q6, [x10, #0x0]\n" ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + "ldr q0, [x25, #0x0]\n" ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + "ldr q1, [x24, #0x0]\n" ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" + "ldr q2, [x23, #0x0]\n" ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n" + "ldr q3, [x22, #0x0]\n" ".inst 0x4f64f8fb // bfdot 
v27.4s, v7.8h, v4.h[3]\n" - "bge 163b\n" - "164:" // Height 5: Multiply loop: Single iteration only - "sub x11, x11, #0x8\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q4, [x22, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "ldr q4, [x21, #0x0]\n" + "bge 158b\n" + "159:" // Height 5: Multiply loop: Single iteration only ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" + "sub x26, x26, #0x8\n" ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "add x25, x25, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" - "add x28, x28, #0x10\n" + "add x24, x24, #0x10\n" ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x23, x23, #0x10\n" ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "ldr q6, [x15, #0x20]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "add x22, x22, #0x10\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" - "add x24, x24, #0x10\n" - "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x10, #0x20]\n" + "add x21, x21, #0x10\n" ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" - "add x22, x22, #0x10\n" - ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x21, #0x80]\n" ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n" - "ldr 
q6, [x15, #0x40]\n" + "ldr q6, [x10, #0x40]\n" ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n" ".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n" ".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n" ".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n" ".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n" ".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, 
v0.h[2]\n" ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n" ".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n" ".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n" ".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n" ".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n" ".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n" - "ldr q7, [x15, #0xf0]\n" + "ldr q7, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" - "add x15, x15, #0x100\n" ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n" @@ -2448,34 +2395,34 @@ void a64_hybrid_bf16fp32_dot_6x16 ( ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n" ".inst 
0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n" - "165:" // Height 5: Multiply loop: Main loop skip - "cbz x11, 169f\n" - "cmp x11, #0x2\n" - "blt 167f\n" - "166:" // Height 5: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x28], #0x4\n" - "ldr s2, [x26], #0x4\n" - "ldr s3, [x24], #0x4\n" - "ldr s4, [x22], #0x4\n" - "ldr q6, [x15, #0x0]\n" + "160:" // Height 5: Multiply loop: Main loop skip + "cbz x26, 164f\n" + "cmp x26, #0x2\n" + "blt 162f\n" + "161:" // Height 5: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x2\n" + "ldr s1, [x24], #0x4\n" + "cmp x26, #0x2\n" + "ldr s2, [x23], #0x4\n" + "ldr s3, [x22], #0x4\n" + "ldr s4, [x21], #0x4\n" + "ldr q6, [x10, #0x0]\n" ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" - "sub x11, x11, #0x2\n" ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" - "cmp x11, #0x2\n" ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q6, [x10, #0x20]\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - "add x15, x15, #0x40\n" ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" @@ -2485,31 +2432,31 @@ void a64_hybrid_bf16fp32_dot_6x16 ( ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" - "bge 166b\n" - "cbz x11, 169f\n" - "167:" // Height 5: 
Multiply loop: Skip odd blocks - "ldr h0, [x10, #0x0]\n" - "ldr h1, [x28, #0x0]\n" - "ldr h2, [x26, #0x0]\n" - "ldr h3, [x24, #0x0]\n" - "ldr h4, [x22, #0x0]\n" - "168:" // Height 5: Multiply loop: Ragged operand read: Done - "ldr q6, [x15, #0x0]\n" + "bge 161b\n" + "cbz x26, 164f\n" + "162:" // Height 5: Multiply loop: Skip odd blocks + "ldr h0, [x25, #0x0]\n" + "ldr h1, [x24, #0x0]\n" + "ldr h2, [x23, #0x0]\n" + "ldr h3, [x22, #0x0]\n" + "ldr h4, [x21, #0x0]\n" + "163:" // Height 5: Multiply loop: Ragged operand read: Done + "ldr q6, [x10, #0x0]\n" ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q6, [x10, #0x20]\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - "add x15, x15, #0x40\n" ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" @@ -2519,17 +2466,22 @@ void a64_hybrid_bf16fp32_dot_6x16 ( ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" - "169:" // Height 5: Multiply loop: No odd multiplies + "164:" // Height 5: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 160b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm 
pstl1keep, [x9, #0x0]\n" - "prfm pstl1keep, [x27, #0x0]\n" - "prfm pstl1keep, [x25, #0x0]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 155b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x24, x28, x19, LSL #2\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #2\n" "prfm pstl1keep, [x23, #0x0]\n" - "tbz %x[flags], #1, 170f\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbz %x[flags], #1, 165f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1r { v1.4s }, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -2574,193 +2526,169 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "fmax v26.4s, v26.4s, v1.4s\n" "fmin v27.4s, v27.4s, v0.4s\n" "fmax v27.4s, v27.4s, v1.4s\n" - "170:" // Height 5: No activation - "cmp x16, #0x10\n" - "bge 179f\n" - "tbz x16, #3, 174f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v9.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v13.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "st1 { v17.4s }, [x27], #0x10\n" - "st1 { v20.4s }, [x25], #0x10\n" - "st1 { v21.4s }, [x25], #0x10\n" - "st1 { v24.4s }, [x23], #0x10\n" - "st1 { v25.4s }, [x23], #0x10\n" - "tbz x16, #2, 172f\n" - "st1 { v10.4s }, [x13], #0x10\n" - "st1 { v14.4s }, [x9], #0x10\n" - "st1 { v18.4s }, [x27], #0x10\n" - "st1 { v22.4s }, [x25], #0x10\n" - "st1 { v26.4s }, [x23], #0x10\n" - "tbz x16, #1, 171f\n" - "str d11, [x13], #0x8\n" - "str d15, [x9], #0x8\n" - "str d19, [x27], #0x8\n" - "str d23, [x25], #0x8\n" - "str d27, [x23], #0x8\n" - "tbz x16, #0, 178f\n" - "st1 { v11.s }[2], [x13]\n" - "st1 { v15.s }[2], [x9]\n" - "st1 { v19.s }[2], [x27]\n" - "st1 { v23.s }[2], [x25]\n" - "st1 { v27.s }[2], [x23]\n" - "b 178f\n" - "171:" // Height 5: Partial direct writeback: partial_1_12 - "tbz x16, #0, 178f\n" - "str s11, [x13, #0x0]\n" - "str s15, [x9, #0x0]\n" - "str s19, [x27, #0x0]\n" - "str s23, [x25, 
#0x0]\n" - "str s27, [x23, #0x0]\n" - "b 178f\n" - "172:" // Height 5: Partial direct writeback: partial_2_8 - "tbz x16, #1, 173f\n" - "str d10, [x13], #0x8\n" - "str d14, [x9], #0x8\n" - "str d18, [x27], #0x8\n" - "str d22, [x25], #0x8\n" - "str d26, [x23], #0x8\n" - "tbz x16, #0, 178f\n" - "st1 { v10.s }[2], [x13]\n" - "st1 { v14.s }[2], [x9]\n" - "st1 { v18.s }[2], [x27]\n" - "st1 { v22.s }[2], [x25]\n" - "st1 { v26.s }[2], [x23]\n" - "b 178f\n" - "173:" // Height 5: Partial direct writeback: partial_1_8 - "tbz x16, #0, 178f\n" - "str s10, [x13, #0x0]\n" - "str s14, [x9, #0x0]\n" - "str s18, [x27, #0x0]\n" - "str s22, [x25, #0x0]\n" - "str s26, [x23, #0x0]\n" - "b 178f\n" - "174:" // Height 5: Partial direct writeback: partial_4_0 - "tbz x16, #2, 176f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "st1 { v20.4s }, [x25], #0x10\n" - "st1 { v24.4s }, [x23], #0x10\n" - "tbz x16, #1, 175f\n" - "str d9, [x13], #0x8\n" - "str d13, [x9], #0x8\n" - "str d17, [x27], #0x8\n" - "str d21, [x25], #0x8\n" - "str d25, [x23], #0x8\n" - "tbz x16, #0, 178f\n" - "st1 { v9.s }[2], [x13]\n" - "st1 { v13.s }[2], [x9]\n" - "st1 { v17.s }[2], [x27]\n" - "st1 { v21.s }[2], [x25]\n" - "st1 { v25.s }[2], [x23]\n" - "b 178f\n" - "175:" // Height 5: Partial direct writeback: partial_1_4 - "tbz x16, #0, 178f\n" - "str s9, [x13, #0x0]\n" - "str s13, [x9, #0x0]\n" - "str s17, [x27, #0x0]\n" - "str s21, [x25, #0x0]\n" - "str s25, [x23, #0x0]\n" - "b 178f\n" - "176:" // Height 5: Partial direct writeback: partial_2_0 - "tbz x16, #1, 177f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "str d16, [x27], #0x8\n" - "str d20, [x25], #0x8\n" - "str d24, [x23], #0x8\n" - "tbz x16, #0, 178f\n" - "st1 { v8.s }[2], [x13]\n" - "st1 { v12.s }[2], [x9]\n" - "st1 { v16.s }[2], [x27]\n" - "st1 { v20.s }[2], [x25]\n" - "st1 { v24.s }[2], [x23]\n" - "b 178f\n" - "177:" // Height 5: Partial direct writeback: partial_1_0 - "str s8, [x13, 
#0x0]\n" - "str s12, [x9, #0x0]\n" - "str s16, [x27, #0x0]\n" - "str s20, [x25, #0x0]\n" - "str s24, [x23, #0x0]\n" - "178:" // Height 5: Partial direct writeback: Done - "b 180f\n" - "179:" // Height 5: Full writeback - "str q8, [x13, #0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "str q12, [x9, #0x0]\n" - "str q13, [x9, #0x10]\n" - "str q14, [x9, #0x20]\n" - "str q15, [x9, #0x30]\n" - "str q16, [x27, #0x0]\n" - "str q17, [x27, #0x10]\n" - "str q18, [x27, #0x20]\n" - "str q19, [x27, #0x30]\n" - "str q20, [x25, #0x0]\n" - "str q21, [x25, #0x10]\n" - "str q22, [x25, #0x20]\n" - "str q23, [x25, #0x30]\n" - "str q24, [x23, #0x0]\n" - "str q25, [x23, #0x10]\n" - "str q26, [x23, #0x20]\n" - "str q27, [x23, #0x30]\n" - "add x13, x13, #0x40\n" - "add x9, x9, #0x40\n" - "add x27, x27, #0x40\n" - "add x25, x25, #0x40\n" - "add x23, x23, #0x40\n" - "180:" // Height 5: Writeback done - "subs x16, x16, #0x10\n" - "bgt 147b\n" - "b 218f\n" - "181:" // Height 6 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 182f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "ldr x23, [%x[output_ptr], #0x20]\n" - "add x27, x27, x19, LSL #2\n" - "ldr x21, [%x[output_ptr], #0x28]\n" - "add %x[output_ptr], %x[output_ptr], #0x30\n" - "add x25, x25, x19, LSL #2\n" - "add x23, x23, x19, LSL #2\n" - "add x21, x21, x19, LSL #2\n" - "b 183f\n" - "182:" // Height 6: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "add x23, x25, x19, LSL #2\n" - "add x21, x23, x19, LSL #2\n" - "add %x[output_ptr], x21, x19, LSL #2\n" - "183:" // Height 6: Column 
loop - "cbz x14, 184f\n" - "ldr q8, [x14, #0x0]\n" + "165:" // Height 5: No activation + "cmp x11, #0x10\n" + "bge 174f\n" + "tbz x11, #3, 169f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v9.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x24], #0x10\n" + "st1 { v13.4s }, [x24], #0x10\n" + "st1 { v16.4s }, [x23], #0x10\n" + "st1 { v17.4s }, [x23], #0x10\n" + "st1 { v20.4s }, [x22], #0x10\n" + "st1 { v21.4s }, [x22], #0x10\n" + "st1 { v24.4s }, [x21], #0x10\n" + "st1 { v25.4s }, [x21], #0x10\n" + "tbz x11, #2, 167f\n" + "st1 { v10.4s }, [x28], #0x10\n" + "st1 { v14.4s }, [x24], #0x10\n" + "st1 { v18.4s }, [x23], #0x10\n" + "st1 { v22.4s }, [x22], #0x10\n" + "st1 { v26.4s }, [x21], #0x10\n" + "tbz x11, #1, 166f\n" + "str d11, [x28], #0x8\n" + "str d15, [x24], #0x8\n" + "str d19, [x23], #0x8\n" + "str d23, [x22], #0x8\n" + "str d27, [x21], #0x8\n" + "tbz x11, #0, 173f\n" + "st1 { v11.s }[2], [x28]\n" + "st1 { v15.s }[2], [x24]\n" + "st1 { v19.s }[2], [x23]\n" + "st1 { v23.s }[2], [x22]\n" + "st1 { v27.s }[2], [x21]\n" + "b 173f\n" + "166:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x11, #0, 173f\n" + "str s11, [x28, #0x0]\n" + "str s15, [x24, #0x0]\n" + "str s19, [x23, #0x0]\n" + "str s23, [x22, #0x0]\n" + "str s27, [x21, #0x0]\n" + "b 173f\n" + "167:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x11, #1, 168f\n" + "str d10, [x28], #0x8\n" + "str d14, [x24], #0x8\n" + "str d18, [x23], #0x8\n" + "str d22, [x22], #0x8\n" + "str d26, [x21], #0x8\n" + "tbz x11, #0, 173f\n" + "st1 { v10.s }[2], [x28]\n" + "st1 { v14.s }[2], [x24]\n" + "st1 { v18.s }[2], [x23]\n" + "st1 { v22.s }[2], [x22]\n" + "st1 { v26.s }[2], [x21]\n" + "b 173f\n" + "168:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x11, #0, 173f\n" + "str s10, [x28, #0x0]\n" + "str s14, [x24, #0x0]\n" + "str s18, [x23, #0x0]\n" + "str s22, [x22, #0x0]\n" + "str s26, [x21, #0x0]\n" + "b 173f\n" + "169:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x11, #2, 171f\n" + 
"st1 { v8.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x24], #0x10\n" + "st1 { v16.4s }, [x23], #0x10\n" + "st1 { v20.4s }, [x22], #0x10\n" + "st1 { v24.4s }, [x21], #0x10\n" + "tbz x11, #1, 170f\n" + "str d9, [x28], #0x8\n" + "str d13, [x24], #0x8\n" + "str d17, [x23], #0x8\n" + "str d21, [x22], #0x8\n" + "str d25, [x21], #0x8\n" + "tbz x11, #0, 173f\n" + "st1 { v9.s }[2], [x28]\n" + "st1 { v13.s }[2], [x24]\n" + "st1 { v17.s }[2], [x23]\n" + "st1 { v21.s }[2], [x22]\n" + "st1 { v25.s }[2], [x21]\n" + "b 173f\n" + "170:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x11, #0, 173f\n" + "str s9, [x28, #0x0]\n" + "str s13, [x24, #0x0]\n" + "str s17, [x23, #0x0]\n" + "str s21, [x22, #0x0]\n" + "str s25, [x21, #0x0]\n" + "b 173f\n" + "171:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x11, #1, 172f\n" + "str d8, [x28], #0x8\n" + "str d12, [x24], #0x8\n" + "str d16, [x23], #0x8\n" + "str d20, [x22], #0x8\n" + "str d24, [x21], #0x8\n" + "tbz x11, #0, 173f\n" + "st1 { v8.s }[2], [x28]\n" + "st1 { v12.s }[2], [x24]\n" + "st1 { v16.s }[2], [x23]\n" + "st1 { v20.s }[2], [x22]\n" + "st1 { v24.s }[2], [x21]\n" + "b 173f\n" + "172:" // Height 5: Partial direct writeback: partial_1_0 + "str s8, [x28, #0x0]\n" + "str s12, [x24, #0x0]\n" + "str s16, [x23, #0x0]\n" + "str s20, [x22, #0x0]\n" + "str s24, [x21, #0x0]\n" + "173:" // Height 5: Partial direct writeback: Done + "b 175f\n" + "174:" // Height 5: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q12, [x24, #0x0]\n" + "str q13, [x24, #0x10]\n" + "str q14, [x24, #0x20]\n" + "str q15, [x24, #0x30]\n" + "str q16, [x23, #0x0]\n" + "str q17, [x23, #0x10]\n" + "str q18, [x23, #0x20]\n" + "str q19, [x23, #0x30]\n" + "str q20, [x22, #0x0]\n" + "str q21, [x22, #0x10]\n" + "str q22, [x22, #0x20]\n" + "str q23, [x22, #0x30]\n" + "str q24, [x21, #0x0]\n" + "str q25, [x21, #0x10]\n" + "str q26, [x21, #0x20]\n" 
+ "str q27, [x21, #0x30]\n" + "175:" // Height 5: Writeback done + "subs x11, x11, #0x10\n" + "bgt 142b\n" + "b 212f\n" + "176:" // Height 6 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x19, #0x18\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "177:" // Height 6: Column loop + "cbz x9, 178f\n" + "ldr q8, [x9, #0x0]\n" "mov v12.16b, v8.16b\n" - "ldr q9, [x14, #0x10]\n" + "ldr q9, [x9, #0x10]\n" "mov v16.16b, v8.16b\n" - "ldr q10, [x14, #0x20]\n" + "ldr q10, [x9, #0x20]\n" "mov v20.16b, v8.16b\n" - "ldr q11, [x14, #0x30]\n" + "ldr q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" "mov v24.16b, v8.16b\n" - "add x14, x14, #0x40\n" "mov v28.16b, v8.16b\n" "mov v13.16b, v9.16b\n" - "mov v17.16b, v9.16b\n" "mov v14.16b, v10.16b\n" "mov v15.16b, v11.16b\n" + "mov v17.16b, v9.16b\n" "mov v18.16b, v10.16b\n" "mov v19.16b, v11.16b\n" "mov v21.16b, v9.16b\n" @@ -2772,178 +2700,179 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "mov v29.16b, v9.16b\n" "mov v30.16b, v10.16b\n" "mov v31.16b, v11.16b\n" - "b 195f\n" - "184:" // Height 6: no bias - "tbz %x[flags], #0, 194f\n" - "cmp x16, #0x10\n" - "bge 193f\n" - "tbz x16, #3, 188f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "ld1 { v20.4s }, [x25], #0x10\n" - "ld1 { v24.4s }, [x23], #0x10\n" - "ld1 { v28.4s }, [x21], #0x10\n" - "ld1 { v9.4s }, [x13], #0x10\n" - "ld1 { v13.4s }, [x9], #0x10\n" - "ld1 { v17.4s }, [x27], #0x10\n" - "ld1 { v21.4s }, [x25], #0x10\n" - "ld1 { v25.4s }, [x23], #0x10\n" - "ld1 { v29.4s }, [x21], #0x10\n" - "tbz x16, #2, 186f\n" - "ld1 { v10.4s }, [x13], #0x10\n" - "ld1 { v14.4s }, [x9], #0x10\n" - "ld1 { v18.4s }, [x27], #0x10\n" - "ld1 { v22.4s }, [x25], #0x10\n" - "ld1 { v26.4s }, [x23], #0x10\n" - "ld1 { v30.4s }, [x21], #0x10\n" - "tbz x16, #1, 185f\n" + "b 189f\n" + "178:" // 
Height 6: no bias + "tbz %x[flags], #0, 188f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "bge 187f\n" + "tbz x11, #3, 182f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v16.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "ld1 { v24.4s }, [x21], #0x10\n" + "ld1 { v28.4s }, [x20], #0x10\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x24], #0x10\n" + "ld1 { v17.4s }, [x23], #0x10\n" + "ld1 { v21.4s }, [x22], #0x10\n" + "ld1 { v25.4s }, [x21], #0x10\n" + "ld1 { v29.4s }, [x20], #0x10\n" + "tbz x11, #2, 180f\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x24], #0x10\n" + "ld1 { v18.4s }, [x23], #0x10\n" + "ld1 { v22.4s }, [x22], #0x10\n" + "ld1 { v26.4s }, [x21], #0x10\n" + "ld1 { v30.4s }, [x20], #0x10\n" + "tbz x11, #1, 179f\n" "mov x19, #0x38\n" - "ldr d11, [x13], #0x8\n" - "ldr d15, [x9], #0x8\n" - "ldr d19, [x27], #0x8\n" - "ldr d23, [x25], #0x8\n" - "ldr d27, [x23], #0x8\n" - "ldr d31, [x21], #0x8\n" - "tbz x16, #0, 192f\n" - "ld1 { v11.s }[2], [x13]\n" - "ld1 { v15.s }[2], [x9]\n" - "ld1 { v19.s }[2], [x27]\n" - "ld1 { v23.s }[2], [x25]\n" - "ld1 { v27.s }[2], [x23]\n" - "ld1 { v31.s }[2], [x21]\n" - "b 192f\n" - "185:" // Height 6: Partial accumulate: partial_1_12 + "ldr d11, [x28], #0x8\n" + "ldr d15, [x24], #0x8\n" + "ldr d19, [x23], #0x8\n" + "ldr d23, [x22], #0x8\n" + "ldr d27, [x21], #0x8\n" + "ldr d31, [x20], #0x8\n" + "tbz x11, #0, 186f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x24]\n" + "ld1 { v19.s }[2], [x23]\n" + "ld1 { v23.s }[2], [x22]\n" + "ld1 { v27.s }[2], [x21]\n" + "ld1 { v31.s }[2], [x20]\n" + "b 186f\n" + "179:" // Height 6: Partial accumulate: partial_1_12 "mov x19, #0x30\n" - "tbz x16, #0, 192f\n" - "ldr s11, [x13, #0x0]\n" - "ldr s15, [x9, #0x0]\n" - "ldr s19, [x27, 
#0x0]\n" - "ldr s23, [x25, #0x0]\n" - "ldr s27, [x23, #0x0]\n" - "ldr s31, [x21, #0x0]\n" - "b 192f\n" - "186:" // Height 6: Partial accumulate: partial_2_8 - "tbz x16, #1, 187f\n" - "ldr d10, [x13], #0x8\n" - "ldr d14, [x9], #0x8\n" - "ldr d18, [x27], #0x8\n" - "ldr d22, [x25], #0x8\n" - "ldr d26, [x23], #0x8\n" - "ldr d30, [x21], #0x8\n" + "tbz x11, #0, 186f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s15, [x24, #0x0]\n" + "ldr s19, [x23, #0x0]\n" + "ldr s23, [x22, #0x0]\n" + "ldr s27, [x21, #0x0]\n" + "ldr s31, [x20, #0x0]\n" + "b 186f\n" + "180:" // Height 6: Partial accumulate: partial_2_8 + "tbz x11, #1, 181f\n" + "ldr d10, [x28], #0x8\n" + "ldr d14, [x24], #0x8\n" "mov x19, #0x28\n" - "tbz x16, #0, 192f\n" - "ld1 { v10.s }[2], [x13]\n" - "ld1 { v14.s }[2], [x9]\n" - "ld1 { v18.s }[2], [x27]\n" - "ld1 { v22.s }[2], [x25]\n" - "ld1 { v26.s }[2], [x23]\n" - "ld1 { v30.s }[2], [x21]\n" - "b 192f\n" - "187:" // Height 6: Partial accumulate: partial_1_8 + "ldr d18, [x23], #0x8\n" + "ldr d22, [x22], #0x8\n" + "ldr d26, [x21], #0x8\n" + "ldr d30, [x20], #0x8\n" + "tbz x11, #0, 186f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x24]\n" + "ld1 { v18.s }[2], [x23]\n" + "ld1 { v22.s }[2], [x22]\n" + "ld1 { v26.s }[2], [x21]\n" + "ld1 { v30.s }[2], [x20]\n" + "b 186f\n" + "181:" // Height 6: Partial accumulate: partial_1_8 "mov x19, #0x20\n" - "tbz x16, #0, 192f\n" - "ldr s10, [x13, #0x0]\n" - "ldr s14, [x9, #0x0]\n" - "ldr s18, [x27, #0x0]\n" - "ldr s22, [x25, #0x0]\n" - "ldr s26, [x23, #0x0]\n" - "ldr s30, [x21, #0x0]\n" - "b 192f\n" - "188:" // Height 6: Partial accumulate: partial_4_0 - "tbz x16, #2, 190f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "ld1 { v20.4s }, [x25], #0x10\n" - "ld1 { v24.4s }, [x23], #0x10\n" - "ld1 { v28.4s }, [x21], #0x10\n" - "tbz x16, #1, 189f\n" + "tbz x11, #0, 186f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s14, [x24, #0x0]\n" + "ldr s18, [x23, #0x0]\n" + "ldr s22, [x22, #0x0]\n" 
+ "ldr s26, [x21, #0x0]\n" + "ldr s30, [x20, #0x0]\n" + "b 186f\n" + "182:" // Height 6: Partial accumulate: partial_4_0 + "tbz x11, #2, 184f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v16.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "ld1 { v24.4s }, [x21], #0x10\n" + "ld1 { v28.4s }, [x20], #0x10\n" + "tbz x11, #1, 183f\n" "mov x19, #0x18\n" - "ldr d9, [x13], #0x8\n" - "ldr d13, [x9], #0x8\n" - "ldr d17, [x27], #0x8\n" - "ldr d21, [x25], #0x8\n" - "ldr d25, [x23], #0x8\n" - "ldr d29, [x21], #0x8\n" - "tbz x16, #0, 192f\n" - "ld1 { v9.s }[2], [x13]\n" - "ld1 { v13.s }[2], [x9]\n" - "ld1 { v17.s }[2], [x27]\n" - "ld1 { v21.s }[2], [x25]\n" - "ld1 { v25.s }[2], [x23]\n" - "ld1 { v29.s }[2], [x21]\n" - "b 192f\n" - "189:" // Height 6: Partial accumulate: partial_1_4 + "ldr d9, [x28], #0x8\n" + "ldr d13, [x24], #0x8\n" + "ldr d17, [x23], #0x8\n" + "ldr d21, [x22], #0x8\n" + "ldr d25, [x21], #0x8\n" + "ldr d29, [x20], #0x8\n" + "tbz x11, #0, 186f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x24]\n" + "ld1 { v17.s }[2], [x23]\n" + "ld1 { v21.s }[2], [x22]\n" + "ld1 { v25.s }[2], [x21]\n" + "ld1 { v29.s }[2], [x20]\n" + "b 186f\n" + "183:" // Height 6: Partial accumulate: partial_1_4 "mov x19, #0x10\n" - "tbz x16, #0, 192f\n" - "ldr s9, [x13, #0x0]\n" - "ldr s13, [x9, #0x0]\n" - "ldr s17, [x27, #0x0]\n" - "ldr s21, [x25, #0x0]\n" - "ldr s25, [x23, #0x0]\n" - "ldr s29, [x21, #0x0]\n" - "b 192f\n" - "190:" // Height 6: Partial accumulate: partial_2_0 - "tbz x16, #1, 191f\n" - "ldr d8, [x13], #0x8\n" - "ldr d12, [x9], #0x8\n" - "ldr d16, [x27], #0x8\n" - "ldr d20, [x25], #0x8\n" - "ldr d24, [x23], #0x8\n" - "ldr d28, [x21], #0x8\n" + "tbz x11, #0, 186f\n" + "ldr s9, [x28, #0x0]\n" + "ldr s13, [x24, #0x0]\n" + "ldr s17, [x23, #0x0]\n" + "ldr s21, [x22, #0x0]\n" + "ldr s25, [x21, #0x0]\n" + "ldr s29, [x20, #0x0]\n" + "b 186f\n" + "184:" // Height 6: Partial accumulate: partial_2_0 + "tbz x11, #1, 185f\n" + "ldr d8, 
[x28], #0x8\n" + "ldr d12, [x24], #0x8\n" "mov x19, #0x8\n" - "tbz x16, #0, 192f\n" - "ld1 { v8.s }[2], [x13]\n" - "ld1 { v12.s }[2], [x9]\n" - "ld1 { v16.s }[2], [x27]\n" - "ld1 { v20.s }[2], [x25]\n" - "ld1 { v24.s }[2], [x23]\n" - "ld1 { v28.s }[2], [x21]\n" - "b 192f\n" - "191:" // Height 6: Partial accumulate: partial_1_0 + "ldr d16, [x23], #0x8\n" + "ldr d20, [x22], #0x8\n" + "ldr d24, [x21], #0x8\n" + "ldr d28, [x20], #0x8\n" + "tbz x11, #0, 186f\n" + "ld1 { v8.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x24]\n" + "ld1 { v16.s }[2], [x23]\n" + "ld1 { v20.s }[2], [x22]\n" + "ld1 { v24.s }[2], [x21]\n" + "ld1 { v28.s }[2], [x20]\n" + "b 186f\n" + "185:" // Height 6: Partial accumulate: partial_1_0 + "ldr s8, [x28, #0x0]\n" "mov x19, #0x0\n" - "ldr s8, [x13, #0x0]\n" - "ldr s12, [x9, #0x0]\n" - "ldr s16, [x27, #0x0]\n" - "ldr s20, [x25, #0x0]\n" - "ldr s24, [x23, #0x0]\n" - "ldr s28, [x21, #0x0]\n" - "192:" // Height 6: Partial accumulate: Done - "sub x13, x13, x19\n" - "sub x9, x9, x19\n" - "sub x27, x27, x19\n" - "sub x25, x25, x19\n" - "sub x23, x23, x19\n" - "sub x21, x21, x19\n" - "b 195f\n" - "193:" // Height 6: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "ldr q12, [x9, #0x0]\n" - "ldr q13, [x9, #0x10]\n" - "ldr q14, [x9, #0x20]\n" - "ldr q15, [x9, #0x30]\n" - "ldr q16, [x27, #0x0]\n" - "ldr q17, [x27, #0x10]\n" - "ldr q18, [x27, #0x20]\n" - "ldr q19, [x27, #0x30]\n" - "ldr q20, [x25, #0x0]\n" - "ldr q21, [x25, #0x10]\n" - "ldr q22, [x25, #0x20]\n" - "ldr q23, [x25, #0x30]\n" - "ldr q24, [x23, #0x0]\n" - "ldr q25, [x23, #0x10]\n" - "ldr q26, [x23, #0x20]\n" - "ldr q27, [x23, #0x30]\n" - "ldr q28, [x21, #0x0]\n" - "ldr q29, [x21, #0x10]\n" - "ldr q30, [x21, #0x20]\n" - "ldr q31, [x21, #0x30]\n" - "b 195f\n" - "194:" // Height 6: no accumulate + "ldr s12, [x24, #0x0]\n" + "ldr s16, [x23, #0x0]\n" + "ldr s20, [x22, #0x0]\n" + "ldr s24, [x21, #0x0]\n" + "ldr s28, [x20, #0x0]\n" + 
"186:" // Height 6: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 189f\n" + "187:" // Height 6: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "ldr q12, [x24, #0x0]\n" + "ldr q13, [x24, #0x10]\n" + "ldr q14, [x24, #0x20]\n" + "ldr q15, [x24, #0x30]\n" + "ldr q16, [x23, #0x0]\n" + "ldr q17, [x23, #0x10]\n" + "ldr q18, [x23, #0x20]\n" + "ldr q19, [x23, #0x30]\n" + "ldr q20, [x22, #0x0]\n" + "ldr q21, [x22, #0x10]\n" + "ldr q22, [x22, #0x20]\n" + "ldr q23, [x22, #0x30]\n" + "ldr q24, [x21, #0x0]\n" + "ldr q25, [x21, #0x10]\n" + "ldr q26, [x21, #0x20]\n" + "ldr q27, [x21, #0x30]\n" + "ldr q28, [x20, #0x0]\n" + "ldr q29, [x20, #0x10]\n" + "ldr q30, [x20, #0x20]\n" + "ldr q31, [x20, #0x30]\n" + "b 189f\n" + "188:" // Height 6: no accumulate "movi v8.16b, #0x0\n" "movi v9.16b, #0x0\n" "movi v10.16b, #0x0\n" @@ -2968,299 +2897,299 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "movi v29.16b, #0x0\n" "movi v30.16b, #0x0\n" "movi v31.16b, #0x0\n" - "195:" // Height 6: setup done - "mov x12, #0x0\n" - "196:" // Height 6: String loop + "189:" // Height 6: setup done + "mov x27, #0x0\n" + "190:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 197f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 191f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "ldr x22, [x20, #0x20]\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" "ldr x20, [x20, #0x28]\n" - "cbnz x12, 198f\n" + "cbnz x27, 192f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL 
#1\n" - "add x28, x28, x19, LSL #1\n" - "add x26, x26, x19, LSL #1\n" + "add x25, x25, x19, LSL #1\n" "add x24, x24, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" "add x22, x22, x19, LSL #1\n" + "add x21, x21, x19, LSL #1\n" "add x20, x20, x19, LSL #1\n" - "b 198f\n" - "197:" // Height 6: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #1\n" - "add x26, x28, x19, LSL #1\n" - "add x24, x26, x19, LSL #1\n" - "add x22, x24, x19, LSL #1\n" - "add x20, x22, x19, LSL #1\n" - "198:" // Height 6: input setup done - "cmp x11, #0x8\n" - "blt 201f\n" - "cmp x11, #0x10\n" - "blt 200f\n" - "199:" // Height 6: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q4, [x22, #0x0]\n" + "b 192f\n" + "191:" // Height 6: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "add x21, x22, x19, LSL #1\n" + "add x20, x21, x19, LSL #1\n" + "192:" // Height 6: input setup done + "cmp x26, #0x8\n" + "blt 195f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x10\n" + "ldr q2, [x23, #0x0]\n" + "ldr q3, [x22, #0x0]\n" + "ldr q4, [x21, #0x0]\n" "ldr q5, [x20, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "ldr q6, [x10, #0x0]\n" + "blt 194f\n" + "193:" // Height 6: Multiply loop: Main loop head ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" + "add x25, x25, #0x10\n" ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x23, x23, #0x10\n" ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x23, #0x80]\n" + 
"add x22, x22, #0x10\n" ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "add x24, x24, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "add x21, x21, #0x10\n" ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q6, [x10, #0x20]\n" + "add x20, x20, #0x10\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" - "add x22, x22, #0x10\n" - "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "sub x26, x26, #0x8\n" ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" - "add x20, x20, #0x10\n" - ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" "prfm pldl1keep, [x20, #0x80]\n" - "sub x11, x11, #0x8\n" + "cmp x26, #0x10\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" - "cmp x11, #0x10\n" ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" ".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n" ".inst 0x4f45f0de // bfdot v30.4s, v6.8h, v5.h[0]\n" - "ldr q6, [x15, #0x40]\n" + "ldr q6, [x10, #0x40]\n" ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" ".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n" 
".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n" ".inst 0x4f65f0dc // bfdot v28.4s, v6.8h, v5.h[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n" ".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n" ".inst 0x4f65f0fd // bfdot v29.4s, v7.8h, v5.h[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n" ".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n" ".inst 0x4f65f0de // bfdot v30.4s, v6.8h, v5.h[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n" ".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n" ".inst 0x4f65f0ff // bfdot v31.4s, v7.8h, v5.h[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n" ".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n" ".inst 0x4f45f8dc // bfdot v28.4s, v6.8h, v5.h[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n" ".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n" ".inst 0x4f45f8fd // bfdot v29.4s, v7.8h, v5.h[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr 
q7, [x10, #0xb0]\n" ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n" ".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n" ".inst 0x4f45f8de // bfdot v30.4s, v6.8h, v5.h[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n" ".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n" ".inst 0x4f45f8ff // bfdot v31.4s, v7.8h, v5.h[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n" ".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n" ".inst 0x4f65f8dc // bfdot v28.4s, v6.8h, v5.h[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n" ".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n" ".inst 0x4f65f8fd // bfdot v29.4s, v7.8h, v5.h[3]\n" - "ldr q7, [x15, #0xf0]\n" + "ldr q7, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" - "add x15, x15, #0x100\n" ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n" ".inst 0x4f64f8da // bfdot v26.4s, v6.8h, v4.h[3]\n" ".inst 0x4f65f8de // bfdot v30.4s, v6.8h, v5.h[3]\n" + "ldr q6, [x10, #0x0]\n" ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + "ldr q0, [x25, #0x0]\n" ".inst 
0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + "ldr q1, [x24, #0x0]\n" ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" + "ldr q2, [x23, #0x0]\n" ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n" + "ldr q3, [x22, #0x0]\n" ".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n" + "ldr q4, [x21, #0x0]\n" ".inst 0x4f65f8ff // bfdot v31.4s, v7.8h, v5.h[3]\n" - "bge 199b\n" - "200:" // Height 6: Multiply loop: Single iteration only - "sub x11, x11, #0x8\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q4, [x22, #0x0]\n" "ldr q5, [x20, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "bge 193b\n" + "194:" // Height 6: Multiply loop: Single iteration only ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" + "sub x26, x26, #0x8\n" ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "add x25, x25, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" - "add x28, x28, #0x10\n" - ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" - ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" "add x24, x24, #0x10\n" - ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n" + ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" "prfm pldl1keep, [x24, #0x80]\n" - "ldr q6, [x15, #0x20]\n" - ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "add x23, x23, #0x10\n" + ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" + "prfm pldl1keep, [x23, #0x80]\n" "add x22, x22, #0x10\n" + ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n" "prfm pldl1keep, [x22, #0x80]\n" - ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + "add x21, x21, #0x10\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "ldr q6, [x10, #0x20]\n" "add x20, x20, #0x10\n" + ".inst 0x4f41f0ed // bfdot 
v13.4s, v7.8h, v1.h[0]\n" + "prfm pldl1keep, [x21, #0x80]\n" ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" "prfm pldl1keep, [x20, #0x80]\n" ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" ".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n" ".inst 0x4f45f0de // bfdot v30.4s, v6.8h, v5.h[0]\n" - "ldr q6, [x15, #0x40]\n" + "ldr q6, [x10, #0x40]\n" ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" ".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n" ".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n" ".inst 0x4f65f0dc // bfdot v28.4s, v6.8h, v5.h[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n" ".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n" ".inst 0x4f65f0fd // bfdot v29.4s, v7.8h, v5.h[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, 
v2.h[1]\n" ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n" ".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n" ".inst 0x4f65f0de // bfdot v30.4s, v6.8h, v5.h[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n" ".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n" ".inst 0x4f65f0ff // bfdot v31.4s, v7.8h, v5.h[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n" ".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n" ".inst 0x4f45f8dc // bfdot v28.4s, v6.8h, v5.h[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n" ".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n" ".inst 0x4f45f8fd // bfdot v29.4s, v7.8h, v5.h[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n" ".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n" ".inst 0x4f45f8de // bfdot v30.4s, v6.8h, v5.h[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n" ".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n" ".inst 0x4f45f8ff // 
bfdot v31.4s, v7.8h, v5.h[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n" ".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n" ".inst 0x4f65f8dc // bfdot v28.4s, v6.8h, v5.h[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n" ".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n" ".inst 0x4f65f8fd // bfdot v29.4s, v7.8h, v5.h[3]\n" - "ldr q7, [x15, #0xf0]\n" + "ldr q7, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" - "add x15, x15, #0x100\n" ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n" @@ -3272,37 +3201,37 @@ void a64_hybrid_bf16fp32_dot_6x16 ( ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n" ".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n" ".inst 0x4f65f8ff // bfdot v31.4s, v7.8h, v5.h[3]\n" - "201:" // Height 6: Multiply loop: Main loop skip - "cbz x11, 205f\n" - "cmp x11, #0x2\n" - "blt 203f\n" - "202:" // Height 6: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x28], #0x4\n" - "ldr s2, [x26], #0x4\n" - "ldr s3, [x24], #0x4\n" - "ldr s4, [x22], #0x4\n" + "195:" // Height 6: Multiply loop: Main loop skip + "cbz x26, 199f\n" + "cmp x26, #0x2\n" + "blt 197f\n" + "196:" // Height 6: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x2\n" + "ldr s1, [x24], #0x4\n" + "cmp x26, #0x2\n" + "ldr s2, [x23], #0x4\n" + "ldr s3, [x22], #0x4\n" + "ldr s4, [x21], #0x4\n" "ldr s5, [x20], #0x4\n" - "ldr q6, [x15, #0x0]\n" + "ldr q6, 
[x10, #0x0]\n" ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" - "sub x11, x11, #0x2\n" ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" - "cmp x11, #0x2\n" ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q6, [x10, #0x20]\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" ".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - "add x15, x15, #0x40\n" ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" @@ -3314,34 +3243,34 @@ void a64_hybrid_bf16fp32_dot_6x16 ( ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" ".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n" - "bge 202b\n" - "cbz x11, 205f\n" - "203:" // Height 6: Multiply loop: Skip odd blocks - "ldr h0, [x10, #0x0]\n" - "ldr h1, [x28, #0x0]\n" - "ldr h2, [x26, #0x0]\n" - "ldr h3, [x24, #0x0]\n" - "ldr h4, [x22, #0x0]\n" + "bge 196b\n" + "cbz x26, 199f\n" + "197:" // Height 6: Multiply loop: Skip odd blocks + "ldr h0, [x25, #0x0]\n" + "ldr h1, [x24, #0x0]\n" + "ldr h2, [x23, #0x0]\n" + "ldr h3, [x22, #0x0]\n" + "ldr h4, [x21, #0x0]\n" "ldr h5, [x20, #0x0]\n" - "204:" // Height 6: Multiply loop: Ragged operand read: Done - "ldr q6, [x15, #0x0]\n" + "198:" // Height 6: Multiply loop: Ragged operand read: Done + "ldr q6, [x10, #0x0]\n" ".inst 0x4f40f0c8 // 
bfdot v8.4s, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q6, [x10, #0x20]\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" ".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - "add x15, x15, #0x40\n" ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" @@ -3353,18 +3282,24 @@ void a64_hybrid_bf16fp32_dot_6x16 ( ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" ".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n" - "205:" // Height 6: Multiply loop: No odd multiplies + "199:" // Height 6: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 196b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "prfm pstl1keep, [x27, #0x0]\n" - "prfm pstl1keep, [x25, #0x0]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 190b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x24, x28, x19, LSL #2\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #2\n" "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" "prfm 
pstl1keep, [x21, #0x0]\n" - "tbz %x[flags], #1, 206f\n" + "add x20, x21, x19, LSL #2\n" + "prfm pstl1keep, [x20, #0x0]\n" + "tbz %x[flags], #1, 200f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1r { v1.4s }, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -3417,185 +3352,180 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "fmin v31.4s, v31.4s, v0.4s\n" "fmax v30.4s, v30.4s, v1.4s\n" "fmax v31.4s, v31.4s, v1.4s\n" - "206:" // Height 6: No activation - "cmp x16, #0x10\n" - "bge 215f\n" - "tbz x16, #3, 210f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v9.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v13.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "st1 { v17.4s }, [x27], #0x10\n" - "st1 { v20.4s }, [x25], #0x10\n" - "st1 { v21.4s }, [x25], #0x10\n" - "st1 { v24.4s }, [x23], #0x10\n" - "st1 { v25.4s }, [x23], #0x10\n" - "st1 { v28.4s }, [x21], #0x10\n" - "st1 { v29.4s }, [x21], #0x10\n" - "tbz x16, #2, 208f\n" - "st1 { v10.4s }, [x13], #0x10\n" - "st1 { v14.4s }, [x9], #0x10\n" - "st1 { v18.4s }, [x27], #0x10\n" - "st1 { v22.4s }, [x25], #0x10\n" - "st1 { v26.4s }, [x23], #0x10\n" - "st1 { v30.4s }, [x21], #0x10\n" - "tbz x16, #1, 207f\n" - "str d11, [x13], #0x8\n" - "str d15, [x9], #0x8\n" - "str d19, [x27], #0x8\n" - "str d23, [x25], #0x8\n" - "str d27, [x23], #0x8\n" - "str d31, [x21], #0x8\n" - "tbz x16, #0, 214f\n" - "st1 { v11.s }[2], [x13]\n" - "st1 { v15.s }[2], [x9]\n" - "st1 { v19.s }[2], [x27]\n" - "st1 { v23.s }[2], [x25]\n" - "st1 { v27.s }[2], [x23]\n" - "st1 { v31.s }[2], [x21]\n" - "b 214f\n" - "207:" // Height 6: Partial direct writeback: partial_1_12 - "tbz x16, #0, 214f\n" - "str s11, [x13, #0x0]\n" - "str s15, [x9, #0x0]\n" - "str s19, [x27, #0x0]\n" - "str s23, [x25, #0x0]\n" - "str s27, [x23, #0x0]\n" - "str s31, [x21, #0x0]\n" - "b 214f\n" - "208:" // Height 6: Partial direct writeback: partial_2_8 - "tbz x16, #1, 209f\n" - "str d10, [x13], #0x8\n" - "str d14, [x9], #0x8\n" - "str d18, [x27], #0x8\n" - "str d22, 
[x25], #0x8\n" - "str d26, [x23], #0x8\n" - "str d30, [x21], #0x8\n" - "tbz x16, #0, 214f\n" - "st1 { v10.s }[2], [x13]\n" - "st1 { v14.s }[2], [x9]\n" - "st1 { v18.s }[2], [x27]\n" - "st1 { v22.s }[2], [x25]\n" - "st1 { v26.s }[2], [x23]\n" - "st1 { v30.s }[2], [x21]\n" - "b 214f\n" - "209:" // Height 6: Partial direct writeback: partial_1_8 - "tbz x16, #0, 214f\n" - "str s10, [x13, #0x0]\n" - "str s14, [x9, #0x0]\n" - "str s18, [x27, #0x0]\n" - "str s22, [x25, #0x0]\n" - "str s26, [x23, #0x0]\n" - "str s30, [x21, #0x0]\n" - "b 214f\n" - "210:" // Height 6: Partial direct writeback: partial_4_0 - "tbz x16, #2, 212f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "st1 { v20.4s }, [x25], #0x10\n" - "st1 { v24.4s }, [x23], #0x10\n" - "st1 { v28.4s }, [x21], #0x10\n" - "tbz x16, #1, 211f\n" - "str d9, [x13], #0x8\n" - "str d13, [x9], #0x8\n" - "str d17, [x27], #0x8\n" - "str d21, [x25], #0x8\n" - "str d25, [x23], #0x8\n" - "str d29, [x21], #0x8\n" - "tbz x16, #0, 214f\n" - "st1 { v9.s }[2], [x13]\n" - "st1 { v13.s }[2], [x9]\n" - "st1 { v17.s }[2], [x27]\n" - "st1 { v21.s }[2], [x25]\n" - "st1 { v25.s }[2], [x23]\n" - "st1 { v29.s }[2], [x21]\n" - "b 214f\n" - "211:" // Height 6: Partial direct writeback: partial_1_4 - "tbz x16, #0, 214f\n" - "str s9, [x13, #0x0]\n" - "str s13, [x9, #0x0]\n" - "str s17, [x27, #0x0]\n" - "str s21, [x25, #0x0]\n" - "str s25, [x23, #0x0]\n" - "str s29, [x21, #0x0]\n" - "b 214f\n" - "212:" // Height 6: Partial direct writeback: partial_2_0 - "tbz x16, #1, 213f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "str d16, [x27], #0x8\n" - "str d20, [x25], #0x8\n" - "str d24, [x23], #0x8\n" - "str d28, [x21], #0x8\n" - "tbz x16, #0, 214f\n" - "st1 { v8.s }[2], [x13]\n" - "st1 { v12.s }[2], [x9]\n" - "st1 { v16.s }[2], [x27]\n" - "st1 { v20.s }[2], [x25]\n" - "st1 { v24.s }[2], [x23]\n" - "st1 { v28.s }[2], [x21]\n" - "b 214f\n" - "213:" // Height 6: Partial direct writeback: 
partial_1_0 - "str s8, [x13, #0x0]\n" - "str s12, [x9, #0x0]\n" - "str s16, [x27, #0x0]\n" - "str s20, [x25, #0x0]\n" - "str s24, [x23, #0x0]\n" - "str s28, [x21, #0x0]\n" - "214:" // Height 6: Partial direct writeback: Done - "b 216f\n" - "215:" // Height 6: Full writeback - "str q8, [x13, #0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "str q12, [x9, #0x0]\n" - "str q13, [x9, #0x10]\n" - "str q14, [x9, #0x20]\n" - "str q15, [x9, #0x30]\n" - "str q16, [x27, #0x0]\n" - "str q17, [x27, #0x10]\n" - "str q18, [x27, #0x20]\n" - "str q19, [x27, #0x30]\n" - "str q20, [x25, #0x0]\n" - "str q21, [x25, #0x10]\n" - "str q22, [x25, #0x20]\n" - "str q23, [x25, #0x30]\n" - "str q24, [x23, #0x0]\n" - "str q25, [x23, #0x10]\n" - "str q26, [x23, #0x20]\n" - "str q27, [x23, #0x30]\n" - "str q28, [x21, #0x0]\n" - "str q29, [x21, #0x10]\n" - "str q30, [x21, #0x20]\n" - "str q31, [x21, #0x30]\n" - "add x13, x13, #0x40\n" - "add x9, x9, #0x40\n" - "add x27, x27, #0x40\n" - "add x25, x25, #0x40\n" - "add x23, x23, #0x40\n" - "add x21, x21, #0x40\n" - "216:" // Height 6: Writeback done - "subs x16, x16, #0x10\n" - "bgt 183b\n" + "200:" // Height 6: No activation + "cmp x11, #0x10\n" + "bge 209f\n" + "tbz x11, #3, 204f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v9.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x24], #0x10\n" + "st1 { v13.4s }, [x24], #0x10\n" + "st1 { v16.4s }, [x23], #0x10\n" + "st1 { v17.4s }, [x23], #0x10\n" + "st1 { v20.4s }, [x22], #0x10\n" + "st1 { v21.4s }, [x22], #0x10\n" + "st1 { v24.4s }, [x21], #0x10\n" + "st1 { v25.4s }, [x21], #0x10\n" + "st1 { v28.4s }, [x20], #0x10\n" + "st1 { v29.4s }, [x20], #0x10\n" + "tbz x11, #2, 202f\n" + "st1 { v10.4s }, [x28], #0x10\n" + "st1 { v14.4s }, [x24], #0x10\n" + "st1 { v18.4s }, [x23], #0x10\n" + "st1 { v22.4s }, [x22], #0x10\n" + "st1 { v26.4s }, [x21], #0x10\n" + "st1 { v30.4s }, [x20], #0x10\n" + "tbz x11, #1, 201f\n" + "str d11, [x28], #0x8\n" + "str d15, [x24], #0x8\n" + 
"str d19, [x23], #0x8\n" + "str d23, [x22], #0x8\n" + "str d27, [x21], #0x8\n" + "str d31, [x20], #0x8\n" + "tbz x11, #0, 208f\n" + "st1 { v11.s }[2], [x28]\n" + "st1 { v15.s }[2], [x24]\n" + "st1 { v19.s }[2], [x23]\n" + "st1 { v23.s }[2], [x22]\n" + "st1 { v27.s }[2], [x21]\n" + "st1 { v31.s }[2], [x20]\n" + "b 208f\n" + "201:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x11, #0, 208f\n" + "str s11, [x28, #0x0]\n" + "str s15, [x24, #0x0]\n" + "str s19, [x23, #0x0]\n" + "str s23, [x22, #0x0]\n" + "str s27, [x21, #0x0]\n" + "str s31, [x20, #0x0]\n" + "b 208f\n" + "202:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x11, #1, 203f\n" + "str d10, [x28], #0x8\n" + "str d14, [x24], #0x8\n" + "str d18, [x23], #0x8\n" + "str d22, [x22], #0x8\n" + "str d26, [x21], #0x8\n" + "str d30, [x20], #0x8\n" + "tbz x11, #0, 208f\n" + "st1 { v10.s }[2], [x28]\n" + "st1 { v14.s }[2], [x24]\n" + "st1 { v18.s }[2], [x23]\n" + "st1 { v22.s }[2], [x22]\n" + "st1 { v26.s }[2], [x21]\n" + "st1 { v30.s }[2], [x20]\n" + "b 208f\n" + "203:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x11, #0, 208f\n" + "str s10, [x28, #0x0]\n" + "str s14, [x24, #0x0]\n" + "str s18, [x23, #0x0]\n" + "str s22, [x22, #0x0]\n" + "str s26, [x21, #0x0]\n" + "str s30, [x20, #0x0]\n" + "b 208f\n" + "204:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x11, #2, 206f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x24], #0x10\n" + "st1 { v16.4s }, [x23], #0x10\n" + "st1 { v20.4s }, [x22], #0x10\n" + "st1 { v24.4s }, [x21], #0x10\n" + "st1 { v28.4s }, [x20], #0x10\n" + "tbz x11, #1, 205f\n" + "str d9, [x28], #0x8\n" + "str d13, [x24], #0x8\n" + "str d17, [x23], #0x8\n" + "str d21, [x22], #0x8\n" + "str d25, [x21], #0x8\n" + "str d29, [x20], #0x8\n" + "tbz x11, #0, 208f\n" + "st1 { v9.s }[2], [x28]\n" + "st1 { v13.s }[2], [x24]\n" + "st1 { v17.s }[2], [x23]\n" + "st1 { v21.s }[2], [x22]\n" + "st1 { v25.s }[2], [x21]\n" + "st1 { v29.s }[2], [x20]\n" + "b 208f\n" 
+ "205:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x11, #0, 208f\n" + "str s9, [x28, #0x0]\n" + "str s13, [x24, #0x0]\n" + "str s17, [x23, #0x0]\n" + "str s21, [x22, #0x0]\n" + "str s25, [x21, #0x0]\n" + "str s29, [x20, #0x0]\n" + "b 208f\n" + "206:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x11, #1, 207f\n" + "str d8, [x28], #0x8\n" + "str d12, [x24], #0x8\n" + "str d16, [x23], #0x8\n" + "str d20, [x22], #0x8\n" + "str d24, [x21], #0x8\n" + "str d28, [x20], #0x8\n" + "tbz x11, #0, 208f\n" + "st1 { v8.s }[2], [x28]\n" + "st1 { v12.s }[2], [x24]\n" + "st1 { v16.s }[2], [x23]\n" + "st1 { v20.s }[2], [x22]\n" + "st1 { v24.s }[2], [x21]\n" + "st1 { v28.s }[2], [x20]\n" + "b 208f\n" + "207:" // Height 6: Partial direct writeback: partial_1_0 + "str s8, [x28, #0x0]\n" + "str s12, [x24, #0x0]\n" + "str s16, [x23, #0x0]\n" + "str s20, [x22, #0x0]\n" + "str s24, [x21, #0x0]\n" + "str s28, [x20, #0x0]\n" + "208:" // Height 6: Partial direct writeback: Done + "b 210f\n" + "209:" // Height 6: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q12, [x24, #0x0]\n" + "str q13, [x24, #0x10]\n" + "str q14, [x24, #0x20]\n" + "str q15, [x24, #0x30]\n" + "str q16, [x23, #0x0]\n" + "str q17, [x23, #0x10]\n" + "str q18, [x23, #0x20]\n" + "str q19, [x23, #0x30]\n" + "str q20, [x22, #0x0]\n" + "str q21, [x22, #0x10]\n" + "str q22, [x22, #0x20]\n" + "str q23, [x22, #0x30]\n" + "str q24, [x21, #0x0]\n" + "str q25, [x21, #0x10]\n" + "str q26, [x21, #0x20]\n" + "str q27, [x21, #0x30]\n" + "str q28, [x20, #0x0]\n" + "str q29, [x20, #0x10]\n" + "str q30, [x20, #0x20]\n" + "str q31, [x20, #0x30]\n" + "210:" // Height 6: Writeback done + "subs x11, x11, #0x10\n" + "bgt 177b\n" "subs %x[M], %x[M], #0x6\n" - "beq 218f\n" + "beq 212f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 217f\n" + "tbz %x[flags], #3, 211f\n" "add x20, x20, 
#0x6\n" "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "217:" // Update direct input + "211:" // Update direct input "mov x19, #0xc\n" "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" "b 1b\n" - "218:" // Exit + "212:" // Exit : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp index ca2696bebd..674d71d626 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp @@ -37,9 +37,9 @@ namespace arm_gemm { - // Actual 
kernel implementations void a64_hybrid_fp16_mla_6x32( ARGLIST ); +void a64_hybrid_fp16_mla_6x32_a55( ARGLIST ); class cls_a64_hybrid_fp16_mla_6x32 { @@ -72,10 +72,11 @@ public: StdTransformsFixed transforms = {}; - static PerformanceParameters get_performance_parameters(const CPUInfo *ci) { + static PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { switch (ci->get_cpu_model()) { case CPUModel::A55r1: - return { 5.22 }; + return { 6.94 }; default: return { 14.53 }; } @@ -83,9 +84,15 @@ public: // Default to the generic kernel kern_type kernel=a64_hybrid_fp16_mla_6x32; - - cls_a64_hybrid_fp16_mla_6x32(const CPUInfo *) + cls_a64_hybrid_fp16_mla_6x32(const CPUInfo *ci) { + switch(ci->get_cpu_model()) { + default: + break; + case CPUModel::A55r1: + kernel=a64_hybrid_fp16_mla_6x32_a55; + break; + } } }; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp new file mode 100644 index 0000000000..87c73740e7 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp @@ -0,0 +1,5757 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)) + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include +#include + +namespace arm_gemm { + +void a64_hybrid_fp16_mla_6x32_a55 ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<__fp16> A_arg, + size_t M, size_t N, const __fp16 *B_ptr, IndirectOutputArg<__fp16> output_arg, + const __fp16 *bias, Activation act, bool accumulate +) +{ + struct KernelArgs { + __fp16 maxval = static_cast<__fp16>(std::numeric_limits::infinity()); + __fp16 minval = - static_cast<__fp16>(std::numeric_limits::infinity()); + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const __fp16 *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if 
(accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast<__fp16>(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 246f\n" + "cmp %x[M], #0x4\n" + "bgt 197f\n" + "beq 148f\n" + "cmp %x[M], #0x2\n" + "bgt 99f\n" + "beq 50f\n" + "ldr x8, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x16, %x[bias]\n" + "mov x15, %x[output_ptr]\n" + "2:" // Height 1: Column loop + "cbz x16, 3f\n" + "ldr q8, [x16, #0x0]\n" + "ldr q9, [x16, #0x10]\n" + "ldr q10, [x16, #0x20]\n" + "ldr q11, [x16, #0x30]\n" + "add x16, x16, #0x40\n" + "b 22f\n" + "3:" // Height 1: no bias + "tbz %x[flags], #0, 21f\n" + "cmp x8, #0x20\n" + "bge 20f\n" + "tbz x8, #4, 11f\n" + "ld1 { v8.8h }, [x15], #0x10\n" + "ld1 { v9.8h }, [x15], #0x10\n" + "tbz x8, #3, 7f\n" + "ld1 { v10.8h }, [x15], #0x10\n" + "tbz x8, #2, 5f\n" + "ldr d11, [x15], #0x8\n" + "tbz x8, #1, 4f\n" + "mov x19, #0x3c\n" + "ld1 { v11.s }[2], [x15], #0x4\n" + "tbz x8, #0, 19f\n" + "ld1 { v11.h }[6], [x15]\n" + "b 19f\n" + "4:" // Height 1: Partial accumulate: partial_1_28 + "mov x19, #0x38\n" + "tbz x8, #0, 19f\n" + "ld1 { v11.h }[4], [x15]\n" + "b 19f\n" + "5:" // Height 1: Partial accumulate: partial_2_24 + "tbz x8, #1, 6f\n" + "ldr s11, [x15], #0x4\n" + "mov x19, #0x34\n" + "tbz x8, #0, 19f\n" + "ld1 { v11.h }[2], [x15]\n" + "b 19f\n" + "6:" // Height 1: Partial accumulate: partial_1_24 + "mov x19, #0x30\n" + "tbz x8, #0, 19f\n" + "ldr h11, [x15, #0x0]\n" + "b 19f\n" + "7:" // Height 1: Partial accumulate: partial_4_16 + "tbz x8, #2, 9f\n" + "ldr d10, [x15], #0x8\n" + "tbz x8, 
#1, 8f\n" + "mov x19, #0x2c\n" + "ld1 { v10.s }[2], [x15], #0x4\n" + "tbz x8, #0, 19f\n" + "ld1 { v10.h }[6], [x15]\n" + "b 19f\n" + "8:" // Height 1: Partial accumulate: partial_1_20 + "mov x19, #0x28\n" + "tbz x8, #0, 19f\n" + "ld1 { v10.h }[4], [x15]\n" + "b 19f\n" + "9:" // Height 1: Partial accumulate: partial_2_16 + "tbz x8, #1, 10f\n" + "ldr s10, [x15], #0x4\n" + "mov x19, #0x24\n" + "tbz x8, #0, 19f\n" + "ld1 { v10.h }[2], [x15]\n" + "b 19f\n" + "10:" // Height 1: Partial accumulate: partial_1_16 + "mov x19, #0x20\n" + "tbz x8, #0, 19f\n" + "ldr h10, [x15, #0x0]\n" + "b 19f\n" + "11:" // Height 1: Partial accumulate: partial_8_0 + "tbz x8, #3, 15f\n" + "ld1 { v8.8h }, [x15], #0x10\n" + "tbz x8, #2, 13f\n" + "ldr d9, [x15], #0x8\n" + "tbz x8, #1, 12f\n" + "mov x19, #0x1c\n" + "ld1 { v9.s }[2], [x15], #0x4\n" + "tbz x8, #0, 19f\n" + "ld1 { v9.h }[6], [x15]\n" + "b 19f\n" + "12:" // Height 1: Partial accumulate: partial_1_12 + "mov x19, #0x18\n" + "tbz x8, #0, 19f\n" + "ld1 { v9.h }[4], [x15]\n" + "b 19f\n" + "13:" // Height 1: Partial accumulate: partial_2_8 + "tbz x8, #1, 14f\n" + "ldr s9, [x15], #0x4\n" + "mov x19, #0x14\n" + "tbz x8, #0, 19f\n" + "ld1 { v9.h }[2], [x15]\n" + "b 19f\n" + "14:" // Height 1: Partial accumulate: partial_1_8 + "mov x19, #0x10\n" + "tbz x8, #0, 19f\n" + "ldr h9, [x15, #0x0]\n" + "b 19f\n" + "15:" // Height 1: Partial accumulate: partial_4_0 + "tbz x8, #2, 17f\n" + "ldr d8, [x15], #0x8\n" + "tbz x8, #1, 16f\n" + "mov x19, #0xc\n" + "ld1 { v8.s }[2], [x15], #0x4\n" + "tbz x8, #0, 19f\n" + "ld1 { v8.h }[6], [x15]\n" + "b 19f\n" + "16:" // Height 1: Partial accumulate: partial_1_4 + "mov x19, #0x8\n" + "tbz x8, #0, 19f\n" + "ld1 { v8.h }[4], [x15]\n" + "b 19f\n" + "17:" // Height 1: Partial accumulate: partial_2_0 + "tbz x8, #1, 18f\n" + "ldr s8, [x15], #0x4\n" + "mov x19, #0x4\n" + "tbz x8, #0, 19f\n" + "ld1 { v8.h }[2], [x15]\n" + "b 19f\n" + "18:" // Height 1: Partial accumulate: partial_1_0 + "ldr h8, [x15, #0x0]\n" + "mov x19, 
#0x0\n" + "19:" // Height 1: Partial accumulate: Done + "sub x15, x15, x19\n" + "b 22f\n" + "20:" // Height 1: full accumulate + "ldr q8, [x15, #0x0]\n" + "ldr q9, [x15, #0x10]\n" + "ldr q10, [x15, #0x20]\n" + "ldr q11, [x15, #0x30]\n" + "b 22f\n" + "21:" // Height 1: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "22:" // Height 1: setup done + "mov x14, #0x0\n" + "23:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w13, [x20, x14, LSL #0x2]\n" + "tbz %x[flags], #3, 24f\n" + "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x12, [x20, #0x0]\n" + "cbnz x14, 25f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x12, x12, x19, LSL #1\n" + "b 25f\n" + "24:" // Height 1: setup direct input + "mov x12, %x[input_ptr]\n" + "25:" // Height 1: input setup done + "cmp x13, #0x8\n" + "blt 28f\n" + "ldr q0, [x12, #0x0]\n" + "ldr q6, [x17, #0x0]\n" + "cmp x13, #0x10\n" + "blt 27f\n" + "26:" // Height 1: Multiply loop: Main loop head + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr d7, [x17, #0x10]\n" + "ldr x11, [x17, #0x18]\n" + "add x12, x12, #0x10\n" + "ldr d6, [x17, #0x20]\n" + "sub x13, x13, #0x8\n" + "ldr x10, [x17, #0x28]\n" + "cmp x13, #0x10\n" + "mov v7.d[1], x11\n" + "prfm pldl1keep, [x12, #0x80]\n" + "ldr x11, [x17, #0x38]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "ldr d6, [x17, #0x40]\n" + "ldr x10, [x17, #0x48]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x58]\n" + "ldr x9, [x12, #0x8]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "ldr d6, [x17, #0x60]\n" + "ldr x10, [x17, #0x68]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x78]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, 
#0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "ldr d6, [x17, #0x80]\n" + "ldr x10, [x17, #0x88]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x98]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "ldr d6, [x17, #0xa0]\n" + "ldr x10, [x17, #0xa8]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0xb8]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "ldr d6, [x17, #0xc0]\n" + "ldr x10, [x17, #0xc8]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0xd8]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "ldr d6, [x17, #0xe0]\n" + "ldr x10, [x17, #0xe8]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0xf8]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "ldr d6, [x17, #0x100]\n" + "ldr x10, [x17, #0x108]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x118]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "ldr d6, [x17, #0x120]\n" + "ldr x10, [x17, #0x128]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x138]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "ldr d6, [x17, #0x140]\n" + "ldr x10, [x17, #0x148]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x158]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "ldr d6, [x17, #0x160]\n" + "ldr x10, [x17, #0x168]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x178]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "ldr d6, [x17, #0x180]\n" + "ldr x10, [x17, #0x188]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x198]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "mov 
v6.d[1], x10\n" + "ldr d7, [x17, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "ldr d6, [x17, #0x1a0]\n" + "ldr x10, [x17, #0x1a8]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x1b8]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "ldr d6, [x17, #0x1c0]\n" + "ldr x10, [x17, #0x1c8]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x1d8]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "ldr d6, [x17, #0x1e0]\n" + "ldr x10, [x17, #0x1e8]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x1f8]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x1f0]\n" + "add x17, x17, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "ldr d6, [x17, #0x0]\n" + "ldr x10, [x17, #0x8]\n" + "mov v7.d[1], x11\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "mov v6.d[1], x10\n" + "ldr d0, [x12, #0x0]\n" + "mov v0.d[1], x9\n" + "bge 26b\n" + "27:" // Height 1: Multiply loop: Single iteration only + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x17, #0x10]\n" + "ldr q6, [x17, #0x20]\n" + "sub x13, x13, #0x8\n" + "add x12, x12, #0x10\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x17, #0x30]\n" + "ldr q6, [x17, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "ldr q7, [x17, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "ldr q6, [x17, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "ldr q7, [x17, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "ldr q6, [x17, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "ldr q7, [x17, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "ldr q6, [x17, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "ldr q7, [x17, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "ldr q6, [x17, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "ldr q7, [x17, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "ldr q6, [x17, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "ldr q7, [x17, #0xf0]\n" + 
"fmla v10.8h, v6.8h, v0.h[3]\n" + "ldr q6, [x17, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "ldr q7, [x17, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "ldr q6, [x17, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "ldr q7, [x17, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "ldr q6, [x17, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "ldr q7, [x17, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "ldr q6, [x17, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "ldr q7, [x17, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "ldr q6, [x17, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "ldr q7, [x17, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "ldr q6, [x17, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "ldr q7, [x17, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "ldr q6, [x17, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "ldr q7, [x17, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "ldr q6, [x17, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "ldr q7, [x17, #0x1f0]\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "add x17, x17, #0x200\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "28:" // Height 1: Multiply loop: Main loop skip + "cbz x13, 30f\n" + "29:" // Height 1: Multiply loop: Odd block loop + "ldr h0, [x12], #0x2\n" + "sub x13, x13, #0x1\n" + "ldr q6, [x17, #0x0]\n" + "ldr q7, [x17, #0x10]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q6, [x17, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "ldr q7, [x17, #0x30]\n" + "add x17, x17, #0x40\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "cbnz x13, 29b\n" + "30:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x14, x14, #0x1\n" + "cmp x14, x19\n" + "bne 23b\n" + "prfm pstl1keep, [x15, #0x0]\n" + "tbz %x[flags], #1, 31f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.8h }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.8h }, [x19]\n" + "fmin v8.8h, v8.8h, v0.8h\n" + "fmin v9.8h, v9.8h, v0.8h\n" + "fmin v10.8h, v10.8h, 
v0.8h\n" + "fmin v11.8h, v11.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "31:" // Height 1: No activation + "cmp x8, #0x20\n" + "bge 48f\n" + "tbz x8, #4, 39f\n" + "st1 { v8.8h }, [x15], #0x10\n" + "st1 { v9.8h }, [x15], #0x10\n" + "tbz x8, #3, 35f\n" + "st1 { v10.8h }, [x15], #0x10\n" + "tbz x8, #2, 33f\n" + "str d11, [x15], #0x8\n" + "tbz x8, #1, 32f\n" + "st1 { v11.s }[2], [x15], #0x4\n" + "tbz x8, #0, 47f\n" + "st1 { v11.h }[6], [x15]\n" + "b 47f\n" + "32:" // Height 1: Partial direct writeback: partial_1_28 + "tbz x8, #0, 47f\n" + "st1 { v11.h }[4], [x15]\n" + "b 47f\n" + "33:" // Height 1: Partial direct writeback: partial_2_24 + "tbz x8, #1, 34f\n" + "str s11, [x15], #0x4\n" + "tbz x8, #0, 47f\n" + "st1 { v11.h }[2], [x15]\n" + "b 47f\n" + "34:" // Height 1: Partial direct writeback: partial_1_24 + "tbz x8, #0, 47f\n" + "str h11, [x15, #0x0]\n" + "b 47f\n" + "35:" // Height 1: Partial direct writeback: partial_4_16 + "tbz x8, #2, 37f\n" + "str d10, [x15], #0x8\n" + "tbz x8, #1, 36f\n" + "st1 { v10.s }[2], [x15], #0x4\n" + "tbz x8, #0, 47f\n" + "st1 { v10.h }[6], [x15]\n" + "b 47f\n" + "36:" // Height 1: Partial direct writeback: partial_1_20 + "tbz x8, #0, 47f\n" + "st1 { v10.h }[4], [x15]\n" + "b 47f\n" + "37:" // Height 1: Partial direct writeback: partial_2_16 + "tbz x8, #1, 38f\n" + "str s10, [x15], #0x4\n" + "tbz x8, #0, 47f\n" + "st1 { v10.h }[2], [x15]\n" + "b 47f\n" + "38:" // Height 1: Partial direct writeback: partial_1_16 + "tbz x8, #0, 47f\n" + "str h10, [x15, #0x0]\n" + "b 47f\n" + "39:" // Height 1: Partial direct writeback: partial_8_0 + "tbz x8, #3, 43f\n" + "st1 { v8.8h }, [x15], #0x10\n" + "tbz x8, #2, 41f\n" + "str d9, [x15], #0x8\n" + "tbz x8, #1, 40f\n" + "st1 { v9.s }[2], [x15], #0x4\n" + "tbz x8, #0, 47f\n" + "st1 { v9.h }[6], [x15]\n" + "b 47f\n" + "40:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x8, #0, 47f\n" + "st1 { 
v9.h }[4], [x15]\n" + "b 47f\n" + "41:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x8, #1, 42f\n" + "str s9, [x15], #0x4\n" + "tbz x8, #0, 47f\n" + "st1 { v9.h }[2], [x15]\n" + "b 47f\n" + "42:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x8, #0, 47f\n" + "str h9, [x15, #0x0]\n" + "b 47f\n" + "43:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x8, #2, 45f\n" + "str d8, [x15], #0x8\n" + "tbz x8, #1, 44f\n" + "st1 { v8.s }[2], [x15], #0x4\n" + "tbz x8, #0, 47f\n" + "st1 { v8.h }[6], [x15]\n" + "b 47f\n" + "44:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x8, #0, 47f\n" + "st1 { v8.h }[4], [x15]\n" + "b 47f\n" + "45:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x8, #1, 46f\n" + "str s8, [x15], #0x4\n" + "tbz x8, #0, 47f\n" + "st1 { v8.h }[2], [x15]\n" + "b 47f\n" + "46:" // Height 1: Partial direct writeback: partial_1_0 + "str h8, [x15, #0x0]\n" + "47:" // Height 1: Partial direct writeback: Done + "b 49f\n" + "48:" // Height 1: Full writeback + "str q8, [x15, #0x0]\n" + "str q9, [x15, #0x10]\n" + "str q10, [x15, #0x20]\n" + "str q11, [x15, #0x30]\n" + "add x15, x15, #0x40\n" + "49:" // Height 1: Writeback done + "subs x8, x8, #0x20\n" + "bgt 2b\n" + "b 296f\n" + "50:" // Height 2 + "ldr x8, [%x[args_ptr], %[offsetof_N]]\n" + "mov x16, %x[bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x15, %x[output_ptr]\n" + "51:" // Height 2: Column loop + "cbz x16, 52f\n" + "ldr q8, [x16, #0x0]\n" + "ldr q9, [x16, #0x10]\n" + "ldr q10, [x16, #0x20]\n" + "mov v12.16b, v8.16b\n" + "ldr q11, [x16, #0x30]\n" + "mov v13.16b, v9.16b\n" + "add x16, x16, #0x40\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "b 71f\n" + "52:" // Height 2: no bias + "tbz %x[flags], #0, 70f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x8, #0x20\n" + "add x25, x15, x19, LSL #1\n" + "bge 69f\n" + "tbz x8, #4, 60f\n" + "ld1 { v8.8h }, [x15], #0x10\n" + "ld1 { v12.8h }, [x25], #0x10\n" + "ld1 { 
v9.8h }, [x15], #0x10\n" + "ld1 { v13.8h }, [x25], #0x10\n" + "tbz x8, #3, 56f\n" + "ld1 { v10.8h }, [x15], #0x10\n" + "ld1 { v14.8h }, [x25], #0x10\n" + "tbz x8, #2, 54f\n" + "ldr d11, [x15], #0x8\n" + "ldr d15, [x25], #0x8\n" + "tbz x8, #1, 53f\n" + "mov x19, #0x3c\n" + "ld1 { v11.s }[2], [x15], #0x4\n" + "ld1 { v15.s }[2], [x25], #0x4\n" + "tbz x8, #0, 68f\n" + "ld1 { v11.h }[6], [x15]\n" + "ld1 { v15.h }[6], [x25]\n" + "b 68f\n" + "53:" // Height 2: Partial accumulate: partial_1_28 + "mov x19, #0x38\n" + "tbz x8, #0, 68f\n" + "ld1 { v11.h }[4], [x15]\n" + "ld1 { v15.h }[4], [x25]\n" + "b 68f\n" + "54:" // Height 2: Partial accumulate: partial_2_24 + "tbz x8, #1, 55f\n" + "ldr s11, [x15], #0x4\n" + "ldr s15, [x25], #0x4\n" + "mov x19, #0x34\n" + "tbz x8, #0, 68f\n" + "ld1 { v11.h }[2], [x15]\n" + "ld1 { v15.h }[2], [x25]\n" + "b 68f\n" + "55:" // Height 2: Partial accumulate: partial_1_24 + "mov x19, #0x30\n" + "tbz x8, #0, 68f\n" + "ldr h11, [x15, #0x0]\n" + "ldr h15, [x25, #0x0]\n" + "b 68f\n" + "56:" // Height 2: Partial accumulate: partial_4_16 + "tbz x8, #2, 58f\n" + "ldr d10, [x15], #0x8\n" + "ldr d14, [x25], #0x8\n" + "tbz x8, #1, 57f\n" + "mov x19, #0x2c\n" + "ld1 { v10.s }[2], [x15], #0x4\n" + "ld1 { v14.s }[2], [x25], #0x4\n" + "tbz x8, #0, 68f\n" + "ld1 { v10.h }[6], [x15]\n" + "ld1 { v14.h }[6], [x25]\n" + "b 68f\n" + "57:" // Height 2: Partial accumulate: partial_1_20 + "mov x19, #0x28\n" + "tbz x8, #0, 68f\n" + "ld1 { v10.h }[4], [x15]\n" + "ld1 { v14.h }[4], [x25]\n" + "b 68f\n" + "58:" // Height 2: Partial accumulate: partial_2_16 + "tbz x8, #1, 59f\n" + "ldr s10, [x15], #0x4\n" + "ldr s14, [x25], #0x4\n" + "mov x19, #0x24\n" + "tbz x8, #0, 68f\n" + "ld1 { v10.h }[2], [x15]\n" + "ld1 { v14.h }[2], [x25]\n" + "b 68f\n" + "59:" // Height 2: Partial accumulate: partial_1_16 + "mov x19, #0x20\n" + "tbz x8, #0, 68f\n" + "ldr h10, [x15, #0x0]\n" + "ldr h14, [x25, #0x0]\n" + "b 68f\n" + "60:" // Height 2: Partial accumulate: partial_8_0 + "tbz x8, #3, 
64f\n" + "ld1 { v8.8h }, [x15], #0x10\n" + "ld1 { v12.8h }, [x25], #0x10\n" + "tbz x8, #2, 62f\n" + "ldr d9, [x15], #0x8\n" + "ldr d13, [x25], #0x8\n" + "tbz x8, #1, 61f\n" + "mov x19, #0x1c\n" + "ld1 { v9.s }[2], [x15], #0x4\n" + "ld1 { v13.s }[2], [x25], #0x4\n" + "tbz x8, #0, 68f\n" + "ld1 { v9.h }[6], [x15]\n" + "ld1 { v13.h }[6], [x25]\n" + "b 68f\n" + "61:" // Height 2: Partial accumulate: partial_1_12 + "mov x19, #0x18\n" + "tbz x8, #0, 68f\n" + "ld1 { v9.h }[4], [x15]\n" + "ld1 { v13.h }[4], [x25]\n" + "b 68f\n" + "62:" // Height 2: Partial accumulate: partial_2_8 + "tbz x8, #1, 63f\n" + "ldr s9, [x15], #0x4\n" + "ldr s13, [x25], #0x4\n" + "mov x19, #0x14\n" + "tbz x8, #0, 68f\n" + "ld1 { v9.h }[2], [x15]\n" + "ld1 { v13.h }[2], [x25]\n" + "b 68f\n" + "63:" // Height 2: Partial accumulate: partial_1_8 + "mov x19, #0x10\n" + "tbz x8, #0, 68f\n" + "ldr h9, [x15, #0x0]\n" + "ldr h13, [x25, #0x0]\n" + "b 68f\n" + "64:" // Height 2: Partial accumulate: partial_4_0 + "tbz x8, #2, 66f\n" + "ldr d8, [x15], #0x8\n" + "ldr d12, [x25], #0x8\n" + "tbz x8, #1, 65f\n" + "mov x19, #0xc\n" + "ld1 { v8.s }[2], [x15], #0x4\n" + "ld1 { v12.s }[2], [x25], #0x4\n" + "tbz x8, #0, 68f\n" + "ld1 { v8.h }[6], [x15]\n" + "ld1 { v12.h }[6], [x25]\n" + "b 68f\n" + "65:" // Height 2: Partial accumulate: partial_1_4 + "mov x19, #0x8\n" + "tbz x8, #0, 68f\n" + "ld1 { v8.h }[4], [x15]\n" + "ld1 { v12.h }[4], [x25]\n" + "b 68f\n" + "66:" // Height 2: Partial accumulate: partial_2_0 + "tbz x8, #1, 67f\n" + "ldr s8, [x15], #0x4\n" + "ldr s12, [x25], #0x4\n" + "mov x19, #0x4\n" + "tbz x8, #0, 68f\n" + "ld1 { v8.h }[2], [x15]\n" + "ld1 { v12.h }[2], [x25]\n" + "b 68f\n" + "67:" // Height 2: Partial accumulate: partial_1_0 + "ldr h8, [x15, #0x0]\n" + "mov x19, #0x0\n" + "ldr h12, [x25, #0x0]\n" + "68:" // Height 2: Partial accumulate: Done + "sub x15, x15, x19\n" + "b 71f\n" + "69:" // Height 2: full accumulate + "ldr q8, [x15, #0x0]\n" + "ldr q9, [x15, #0x10]\n" + "ldr q10, [x15, #0x20]\n" + 
"ldr q11, [x15, #0x30]\n" + "ldr q12, [x25, #0x0]\n" + "ldr q13, [x25, #0x10]\n" + "ldr q14, [x25, #0x20]\n" + "ldr q15, [x25, #0x30]\n" + "b 71f\n" + "70:" // Height 2: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "71:" // Height 2: setup done + "mov x14, #0x0\n" + "72:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w13, [x20, x14, LSL #0x2]\n" + "tbz %x[flags], #3, 73f\n" + "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x12, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "cbnz x14, 74f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x12, x12, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "b 74f\n" + "73:" // Height 2: setup direct input + "mov x12, %x[input_ptr]\n" + "add x28, x12, x19, LSL #1\n" + "74:" // Height 2: input setup done + "cmp x13, #0x8\n" + "blt 77f\n" + "ldr q0, [x12, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "cmp x13, #0x10\n" + "ldr q6, [x17, #0x0]\n" + "blt 76f\n" + "75:" // Height 2: Multiply loop: Main loop head + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr d7, [x17, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "ldr x11, [x17, #0x18]\n" + "ldr d6, [x17, #0x20]\n" + "add x12, x12, #0x10\n" + "ldr x10, [x17, #0x28]\n" + "add x28, x28, #0x10\n" + "mov v7.d[1], x11\n" + "prfm pldl1keep, [x12, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "sub x13, x13, #0x8\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "mov v6.d[1], x10\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "ldr d7, [x17, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "ldr x11, [x17, #0x38]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "ldr d6, [x17, #0x40]\n" + "ldr x10, [x17, #0x48]\n" + "cmp x13, #0x10\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x58]\n" + "ldr x9, [x12, #0x8]\n" + "fmla 
v11.8h, v7.8h, v0.h[0]\n" + "mov v6.d[1], x10\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "ldr d7, [x17, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "ldr x10, [x17, #0x68]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "ldr d6, [x17, #0x60]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x78]\n" + "ldr x27, [x28, #0x8]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "mov v6.d[1], x10\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "ldr d7, [x17, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "ldr x10, [x17, #0x88]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "ldr d6, [x17, #0x80]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x98]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "mov v6.d[1], x10\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "ldr d7, [x17, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "ldr x10, [x17, #0xa8]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "ldr d6, [x17, #0xa0]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0xb8]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "mov v6.d[1], x10\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "ldr d7, [x17, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "ldr x10, [x17, #0xc8]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "ldr d6, [x17, #0xc0]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0xd8]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "mov v6.d[1], x10\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "ldr d7, [x17, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "ldr x10, [x17, #0xe8]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "ldr d6, [x17, #0xe0]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0xf8]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "mov v6.d[1], x10\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "ldr d7, [x17, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "ldr x10, [x17, #0x108]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "ldr d6, [x17, #0x100]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x118]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "mov v6.d[1], x10\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "ldr d7, [x17, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "ldr x10, [x17, #0x128]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "ldr d6, [x17, 
#0x120]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x138]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "mov v6.d[1], x10\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "ldr d7, [x17, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "ldr x10, [x17, #0x148]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "ldr d6, [x17, #0x140]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x158]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "mov v6.d[1], x10\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "ldr d7, [x17, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "ldr x10, [x17, #0x168]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "ldr d6, [x17, #0x160]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x178]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "mov v6.d[1], x10\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "ldr d7, [x17, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "ldr x10, [x17, #0x188]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "ldr d6, [x17, #0x180]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x198]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "mov v6.d[1], x10\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "ldr d7, [x17, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "ldr x10, [x17, #0x1a8]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "ldr d6, [x17, #0x1a0]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x1b8]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "mov v6.d[1], x10\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "ldr d7, [x17, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "ldr x10, [x17, #0x1c8]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "ldr d6, [x17, #0x1c0]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x1d8]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "mov v6.d[1], x10\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "ldr d7, [x17, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "ldr x10, [x17, #0x1e8]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "ldr d6, [x17, #0x1e0]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x1f8]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "mov v6.d[1], x10\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "ldr d7, [x17, #0x1f0]\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "add x17, 
x17, #0x200\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "ldr d6, [x17, #0x0]\n" + "mov v7.d[1], x11\n" + "ldr x10, [x17, #0x8]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "ldr d0, [x12, #0x0]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "mov v6.d[1], x10\n" + "ldr d1, [x28, #0x0]\n" + "mov v0.d[1], x9\n" + "mov v1.d[1], x27\n" + "bge 75b\n" + "76:" // Height 2: Multiply loop: Single iteration only + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x17, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "ldr q6, [x17, #0x20]\n" + "sub x13, x13, #0x8\n" + "add x12, x12, #0x10\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "ldr q7, [x17, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "add x28, x28, #0x10\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "ldr q6, [x17, #0x40]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "ldr q7, [x17, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "ldr q6, [x17, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "ldr q7, [x17, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "ldr q6, [x17, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "ldr q7, [x17, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "ldr q6, [x17, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "ldr q7, [x17, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "ldr q6, [x17, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "ldr q7, [x17, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "ldr q6, [x17, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "ldr q7, [x17, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "ldr q6, [x17, #0x100]\n" + "fmla v11.8h, v7.8h, 
v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "ldr q7, [x17, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "ldr q6, [x17, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "ldr q7, [x17, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "ldr q6, [x17, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "ldr q7, [x17, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "ldr q6, [x17, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "ldr q7, [x17, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "ldr q6, [x17, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "ldr q7, [x17, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "ldr q6, [x17, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "ldr q7, [x17, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "ldr q6, [x17, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "ldr q7, [x17, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "ldr q6, [x17, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "ldr q7, [x17, #0x1f0]\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "add x17, x17, #0x200\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "77:" // Height 2: Multiply loop: Main loop skip + "cbz x13, 79f\n" + "78:" // Height 2: Multiply loop: Odd block loop + "ldr h0, [x12], #0x2\n" + "sub x13, x13, #0x1\n" + "ldr h1, [x28], #0x2\n" + "ldr q6, [x17, #0x0]\n" + "ldr q7, [x17, #0x10]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "ldr q6, [x17, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "ldr q7, 
[x17, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "add x17, x17, #0x40\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "cbnz x13, 78b\n" + "79:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x14, x14, #0x1\n" + "cmp x14, x19\n" + "bne 72b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x15, #0x0]\n" + "add x25, x15, x19, LSL #1\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbz %x[flags], #1, 80f\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v1.8h }, [x20]\n" + "ld1r { v0.8h }, [x19]\n" + "fmin v8.8h, v8.8h, v0.8h\n" + "fmin v9.8h, v9.8h, v0.8h\n" + "fmin v10.8h, v10.8h, v0.8h\n" + "fmin v11.8h, v11.8h, v0.8h\n" + "fmin v12.8h, v12.8h, v0.8h\n" + "fmin v13.8h, v13.8h, v0.8h\n" + "fmin v14.8h, v14.8h, v0.8h\n" + "fmin v15.8h, v15.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + "fmax v14.8h, v14.8h, v1.8h\n" + "fmax v15.8h, v15.8h, v1.8h\n" + "80:" // Height 2: No activation + "cmp x8, #0x20\n" + "bge 97f\n" + "tbz x8, #4, 88f\n" + "st1 { v8.8h }, [x15], #0x10\n" + "st1 { v9.8h }, [x15], #0x10\n" + "st1 { v12.8h }, [x25], #0x10\n" + "st1 { v13.8h }, [x25], #0x10\n" + "tbz x8, #3, 84f\n" + "st1 { v10.8h }, [x15], #0x10\n" + "st1 { v14.8h }, [x25], #0x10\n" + "tbz x8, #2, 82f\n" + "str d11, [x15], #0x8\n" + "str d15, [x25], #0x8\n" + "tbz x8, #1, 81f\n" + "st1 { v11.s }[2], [x15], #0x4\n" + "st1 { v15.s }[2], [x25], #0x4\n" + "tbz x8, #0, 96f\n" + "st1 { v11.h }[6], [x15]\n" + "st1 { v15.h }[6], [x25]\n" + "b 96f\n" + "81:" // Height 2: Partial direct writeback: partial_1_28 + "tbz x8, #0, 96f\n" + "st1 { v11.h }[4], [x15]\n" + "st1 { v15.h }[4], [x25]\n" + "b 96f\n" + "82:" // Height 2: Partial direct writeback: 
partial_2_24 + "tbz x8, #1, 83f\n" + "str s11, [x15], #0x4\n" + "str s15, [x25], #0x4\n" + "tbz x8, #0, 96f\n" + "st1 { v11.h }[2], [x15]\n" + "st1 { v15.h }[2], [x25]\n" + "b 96f\n" + "83:" // Height 2: Partial direct writeback: partial_1_24 + "tbz x8, #0, 96f\n" + "str h11, [x15, #0x0]\n" + "str h15, [x25, #0x0]\n" + "b 96f\n" + "84:" // Height 2: Partial direct writeback: partial_4_16 + "tbz x8, #2, 86f\n" + "str d10, [x15], #0x8\n" + "str d14, [x25], #0x8\n" + "tbz x8, #1, 85f\n" + "st1 { v10.s }[2], [x15], #0x4\n" + "st1 { v14.s }[2], [x25], #0x4\n" + "tbz x8, #0, 96f\n" + "st1 { v10.h }[6], [x15]\n" + "st1 { v14.h }[6], [x25]\n" + "b 96f\n" + "85:" // Height 2: Partial direct writeback: partial_1_20 + "tbz x8, #0, 96f\n" + "st1 { v10.h }[4], [x15]\n" + "st1 { v14.h }[4], [x25]\n" + "b 96f\n" + "86:" // Height 2: Partial direct writeback: partial_2_16 + "tbz x8, #1, 87f\n" + "str s10, [x15], #0x4\n" + "str s14, [x25], #0x4\n" + "tbz x8, #0, 96f\n" + "st1 { v10.h }[2], [x15]\n" + "st1 { v14.h }[2], [x25]\n" + "b 96f\n" + "87:" // Height 2: Partial direct writeback: partial_1_16 + "tbz x8, #0, 96f\n" + "str h10, [x15, #0x0]\n" + "str h14, [x25, #0x0]\n" + "b 96f\n" + "88:" // Height 2: Partial direct writeback: partial_8_0 + "tbz x8, #3, 92f\n" + "st1 { v8.8h }, [x15], #0x10\n" + "st1 { v12.8h }, [x25], #0x10\n" + "tbz x8, #2, 90f\n" + "str d9, [x15], #0x8\n" + "str d13, [x25], #0x8\n" + "tbz x8, #1, 89f\n" + "st1 { v9.s }[2], [x15], #0x4\n" + "st1 { v13.s }[2], [x25], #0x4\n" + "tbz x8, #0, 96f\n" + "st1 { v9.h }[6], [x15]\n" + "st1 { v13.h }[6], [x25]\n" + "b 96f\n" + "89:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x8, #0, 96f\n" + "st1 { v9.h }[4], [x15]\n" + "st1 { v13.h }[4], [x25]\n" + "b 96f\n" + "90:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x8, #1, 91f\n" + "str s9, [x15], #0x4\n" + "str s13, [x25], #0x4\n" + "tbz x8, #0, 96f\n" + "st1 { v9.h }[2], [x15]\n" + "st1 { v13.h }[2], [x25]\n" + "b 96f\n" + "91:" // Height 2: 
Partial direct writeback: partial_1_8 + "tbz x8, #0, 96f\n" + "str h9, [x15, #0x0]\n" + "str h13, [x25, #0x0]\n" + "b 96f\n" + "92:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x8, #2, 94f\n" + "str d8, [x15], #0x8\n" + "str d12, [x25], #0x8\n" + "tbz x8, #1, 93f\n" + "st1 { v8.s }[2], [x15], #0x4\n" + "st1 { v12.s }[2], [x25], #0x4\n" + "tbz x8, #0, 96f\n" + "st1 { v8.h }[6], [x15]\n" + "st1 { v12.h }[6], [x25]\n" + "b 96f\n" + "93:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x8, #0, 96f\n" + "st1 { v8.h }[4], [x15]\n" + "st1 { v12.h }[4], [x25]\n" + "b 96f\n" + "94:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x8, #1, 95f\n" + "str s8, [x15], #0x4\n" + "str s12, [x25], #0x4\n" + "tbz x8, #0, 96f\n" + "st1 { v8.h }[2], [x15]\n" + "st1 { v12.h }[2], [x25]\n" + "b 96f\n" + "95:" // Height 2: Partial direct writeback: partial_1_0 + "str h8, [x15, #0x0]\n" + "str h12, [x25, #0x0]\n" + "96:" // Height 2: Partial direct writeback: Done + "b 98f\n" + "97:" // Height 2: Full writeback + "str q8, [x15, #0x0]\n" + "str q9, [x15, #0x10]\n" + "str q10, [x15, #0x20]\n" + "str q11, [x15, #0x30]\n" + "add x15, x15, #0x40\n" + "str q12, [x25, #0x0]\n" + "str q13, [x25, #0x10]\n" + "str q14, [x25, #0x20]\n" + "str q15, [x25, #0x30]\n" + "98:" // Height 2: Writeback done + "subs x8, x8, #0x20\n" + "bgt 51b\n" + "b 296f\n" + "99:" // Height 3 + "ldr x8, [%x[args_ptr], %[offsetof_N]]\n" + "mov x16, %x[bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x15, %x[output_ptr]\n" + "100:" // Height 3: Column loop + "cbz x16, 101f\n" + "ldr q8, [x16, #0x0]\n" + "ldr q9, [x16, #0x10]\n" + "ldr q10, [x16, #0x20]\n" + "mov v12.16b, v8.16b\n" + "mov v16.16b, v8.16b\n" + "mov v13.16b, v9.16b\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v18.16b, v10.16b\n" + "ldr q11, [x16, #0x30]\n" + "add x16, x16, #0x40\n" + "mov v15.16b, v11.16b\n" + "mov v19.16b, v11.16b\n" + "b 120f\n" + "101:" // Height 3: no bias + "tbz %x[flags], 
#0, 119f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x8, #0x20\n" + "add x25, x15, x19, LSL #1\n" + "add x24, x25, x19, LSL #1\n" + "bge 118f\n" + "tbz x8, #4, 109f\n" + "ld1 { v8.8h }, [x15], #0x10\n" + "ld1 { v12.8h }, [x25], #0x10\n" + "ld1 { v16.8h }, [x24], #0x10\n" + "ld1 { v9.8h }, [x15], #0x10\n" + "ld1 { v13.8h }, [x25], #0x10\n" + "ld1 { v17.8h }, [x24], #0x10\n" + "tbz x8, #3, 105f\n" + "ld1 { v10.8h }, [x15], #0x10\n" + "ld1 { v14.8h }, [x25], #0x10\n" + "ld1 { v18.8h }, [x24], #0x10\n" + "tbz x8, #2, 103f\n" + "ldr d11, [x15], #0x8\n" + "ldr d15, [x25], #0x8\n" + "ldr d19, [x24], #0x8\n" + "tbz x8, #1, 102f\n" + "ld1 { v11.s }[2], [x15], #0x4\n" + "mov x19, #0x3c\n" + "ld1 { v15.s }[2], [x25], #0x4\n" + "ld1 { v19.s }[2], [x24], #0x4\n" + "tbz x8, #0, 117f\n" + "ld1 { v11.h }[6], [x15]\n" + "ld1 { v15.h }[6], [x25]\n" + "ld1 { v19.h }[6], [x24]\n" + "b 117f\n" + "102:" // Height 3: Partial accumulate: partial_1_28 + "mov x19, #0x38\n" + "tbz x8, #0, 117f\n" + "ld1 { v11.h }[4], [x15]\n" + "ld1 { v15.h }[4], [x25]\n" + "ld1 { v19.h }[4], [x24]\n" + "b 117f\n" + "103:" // Height 3: Partial accumulate: partial_2_24 + "tbz x8, #1, 104f\n" + "ldr s11, [x15], #0x4\n" + "ldr s15, [x25], #0x4\n" + "mov x19, #0x34\n" + "ldr s19, [x24], #0x4\n" + "tbz x8, #0, 117f\n" + "ld1 { v11.h }[2], [x15]\n" + "ld1 { v15.h }[2], [x25]\n" + "ld1 { v19.h }[2], [x24]\n" + "b 117f\n" + "104:" // Height 3: Partial accumulate: partial_1_24 + "mov x19, #0x30\n" + "tbz x8, #0, 117f\n" + "ldr h11, [x15, #0x0]\n" + "ldr h15, [x25, #0x0]\n" + "ldr h19, [x24, #0x0]\n" + "b 117f\n" + "105:" // Height 3: Partial accumulate: partial_4_16 + "tbz x8, #2, 107f\n" + "ldr d10, [x15], #0x8\n" + "ldr d14, [x25], #0x8\n" + "ldr d18, [x24], #0x8\n" + "tbz x8, #1, 106f\n" + "ld1 { v10.s }[2], [x15], #0x4\n" + "mov x19, #0x2c\n" + "ld1 { v14.s }[2], [x25], #0x4\n" + "ld1 { v18.s }[2], [x24], #0x4\n" + "tbz x8, #0, 117f\n" + "ld1 { v10.h }[6], [x15]\n" + "ld1 { v14.h }[6], 
[x25]\n" + "ld1 { v18.h }[6], [x24]\n" + "b 117f\n" + "106:" // Height 3: Partial accumulate: partial_1_20 + "mov x19, #0x28\n" + "tbz x8, #0, 117f\n" + "ld1 { v10.h }[4], [x15]\n" + "ld1 { v14.h }[4], [x25]\n" + "ld1 { v18.h }[4], [x24]\n" + "b 117f\n" + "107:" // Height 3: Partial accumulate: partial_2_16 + "tbz x8, #1, 108f\n" + "ldr s10, [x15], #0x4\n" + "ldr s14, [x25], #0x4\n" + "mov x19, #0x24\n" + "ldr s18, [x24], #0x4\n" + "tbz x8, #0, 117f\n" + "ld1 { v10.h }[2], [x15]\n" + "ld1 { v14.h }[2], [x25]\n" + "ld1 { v18.h }[2], [x24]\n" + "b 117f\n" + "108:" // Height 3: Partial accumulate: partial_1_16 + "mov x19, #0x20\n" + "tbz x8, #0, 117f\n" + "ldr h10, [x15, #0x0]\n" + "ldr h14, [x25, #0x0]\n" + "ldr h18, [x24, #0x0]\n" + "b 117f\n" + "109:" // Height 3: Partial accumulate: partial_8_0 + "tbz x8, #3, 113f\n" + "ld1 { v8.8h }, [x15], #0x10\n" + "ld1 { v12.8h }, [x25], #0x10\n" + "ld1 { v16.8h }, [x24], #0x10\n" + "tbz x8, #2, 111f\n" + "ldr d9, [x15], #0x8\n" + "ldr d13, [x25], #0x8\n" + "ldr d17, [x24], #0x8\n" + "tbz x8, #1, 110f\n" + "ld1 { v9.s }[2], [x15], #0x4\n" + "mov x19, #0x1c\n" + "ld1 { v13.s }[2], [x25], #0x4\n" + "ld1 { v17.s }[2], [x24], #0x4\n" + "tbz x8, #0, 117f\n" + "ld1 { v9.h }[6], [x15]\n" + "ld1 { v13.h }[6], [x25]\n" + "ld1 { v17.h }[6], [x24]\n" + "b 117f\n" + "110:" // Height 3: Partial accumulate: partial_1_12 + "mov x19, #0x18\n" + "tbz x8, #0, 117f\n" + "ld1 { v9.h }[4], [x15]\n" + "ld1 { v13.h }[4], [x25]\n" + "ld1 { v17.h }[4], [x24]\n" + "b 117f\n" + "111:" // Height 3: Partial accumulate: partial_2_8 + "tbz x8, #1, 112f\n" + "ldr s9, [x15], #0x4\n" + "ldr s13, [x25], #0x4\n" + "mov x19, #0x14\n" + "ldr s17, [x24], #0x4\n" + "tbz x8, #0, 117f\n" + "ld1 { v9.h }[2], [x15]\n" + "ld1 { v13.h }[2], [x25]\n" + "ld1 { v17.h }[2], [x24]\n" + "b 117f\n" + "112:" // Height 3: Partial accumulate: partial_1_8 + "mov x19, #0x10\n" + "tbz x8, #0, 117f\n" + "ldr h9, [x15, #0x0]\n" + "ldr h13, [x25, #0x0]\n" + "ldr h17, [x24, #0x0]\n" + "b 
117f\n" + "113:" // Height 3: Partial accumulate: partial_4_0 + "tbz x8, #2, 115f\n" + "ldr d8, [x15], #0x8\n" + "ldr d12, [x25], #0x8\n" + "ldr d16, [x24], #0x8\n" + "tbz x8, #1, 114f\n" + "ld1 { v8.s }[2], [x15], #0x4\n" + "mov x19, #0xc\n" + "ld1 { v12.s }[2], [x25], #0x4\n" + "ld1 { v16.s }[2], [x24], #0x4\n" + "tbz x8, #0, 117f\n" + "ld1 { v8.h }[6], [x15]\n" + "ld1 { v12.h }[6], [x25]\n" + "ld1 { v16.h }[6], [x24]\n" + "b 117f\n" + "114:" // Height 3: Partial accumulate: partial_1_4 + "mov x19, #0x8\n" + "tbz x8, #0, 117f\n" + "ld1 { v8.h }[4], [x15]\n" + "ld1 { v12.h }[4], [x25]\n" + "ld1 { v16.h }[4], [x24]\n" + "b 117f\n" + "115:" // Height 3: Partial accumulate: partial_2_0 + "tbz x8, #1, 116f\n" + "ldr s8, [x15], #0x4\n" + "ldr s12, [x25], #0x4\n" + "mov x19, #0x4\n" + "ldr s16, [x24], #0x4\n" + "tbz x8, #0, 117f\n" + "ld1 { v8.h }[2], [x15]\n" + "ld1 { v12.h }[2], [x25]\n" + "ld1 { v16.h }[2], [x24]\n" + "b 117f\n" + "116:" // Height 3: Partial accumulate: partial_1_0 + "ldr h8, [x15, #0x0]\n" + "mov x19, #0x0\n" + "ldr h12, [x25, #0x0]\n" + "ldr h16, [x24, #0x0]\n" + "117:" // Height 3: Partial accumulate: Done + "sub x15, x15, x19\n" + "b 120f\n" + "118:" // Height 3: full accumulate + "ldr q8, [x15, #0x0]\n" + "ldr q9, [x15, #0x10]\n" + "ldr q10, [x15, #0x20]\n" + "ldr q11, [x15, #0x30]\n" + "ldr q12, [x25, #0x0]\n" + "ldr q13, [x25, #0x10]\n" + "ldr q14, [x25, #0x20]\n" + "ldr q15, [x25, #0x30]\n" + "ldr q16, [x24, #0x0]\n" + "ldr q17, [x24, #0x10]\n" + "ldr q18, [x24, #0x20]\n" + "ldr q19, [x24, #0x30]\n" + "b 120f\n" + "119:" // Height 3: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "120:" // Height 3: setup done + "mov x14, #0x0\n" + "121:" // Height 3: String loop + "ldr x20, 
[%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w13, [x20, x14, LSL #0x2]\n" + "tbz %x[flags], #3, 122f\n" + "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x12, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "cbnz x14, 123f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x12, x12, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "b 123f\n" + "122:" // Height 3: setup direct input + "mov x12, %x[input_ptr]\n" + "add x28, x12, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "123:" // Height 3: input setup done + "cmp x13, #0x8\n" + "blt 126f\n" + "ldr q0, [x12, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "cmp x13, #0x10\n" + "ldr q2, [x26, #0x0]\n" + "ldr q6, [x17, #0x0]\n" + "blt 125f\n" + "124:" // Height 3: Multiply loop: Main loop head + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr d7, [x17, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "ldr x11, [x17, #0x18]\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "ldr d6, [x17, #0x20]\n" + "ldr x10, [x17, #0x28]\n" + "add x12, x12, #0x10\n" + "mov v7.d[1], x11\n" + "prfm pldl1keep, [x12, #0x80]\n" + "ldr x11, [x17, #0x38]\n" + "add x28, x28, #0x10\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "mov v6.d[1], x10\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "ldr d7, [x17, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "ldr x10, [x17, #0x48]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "ldr x9, [x12, #0x8]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "mov v7.d[1], x11\n" + "ldr d6, [x17, #0x40]\n" + "add x26, x26, #0x10\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "ldr x11, [x17, #0x58]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x50]\n" + "sub x13, x13, #0x8\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "ldr x10, [x17, #0x68]\n" + "fmla 
v12.8h, v6.8h, v1.h[1]\n" + "ldr x27, [x28, #0x8]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "mov v7.d[1], x11\n" + "ldr d6, [x17, #0x60]\n" + "cmp x13, #0x10\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "ldr x11, [x17, #0x78]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "ldr x25, [x26, #0x8]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "ldr x10, [x17, #0x88]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "mov v7.d[1], x11\n" + "ldr d6, [x17, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "ldr x11, [x17, #0x98]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "ldr x10, [x17, #0xa8]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "mov v7.d[1], x11\n" + "ldr d6, [x17, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "ldr x11, [x17, #0xb8]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "ldr x10, [x17, #0xc8]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "mov v7.d[1], x11\n" + "ldr d6, [x17, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "ldr x11, [x17, #0xd8]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "ldr x10, [x17, #0xe8]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "mov v7.d[1], x11\n" + "ldr d6, [x17, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "ldr x11, [x17, #0xf8]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "ldr x10, [x17, #0x108]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "mov v7.d[1], x11\n" + "ldr d6, [x17, #0x100]\n" + "fmla v11.8h, 
v7.8h, v0.h[3]\n" + "ldr x11, [x17, #0x118]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "ldr x10, [x17, #0x128]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "mov v7.d[1], x11\n" + "ldr d6, [x17, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "ldr x11, [x17, #0x138]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "ldr x10, [x17, #0x148]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "mov v7.d[1], x11\n" + "ldr d6, [x17, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "ldr x11, [x17, #0x158]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "ldr x10, [x17, #0x168]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "mov v7.d[1], x11\n" + "ldr d6, [x17, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "ldr x11, [x17, #0x178]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "ldr x10, [x17, #0x188]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "mov v7.d[1], x11\n" + "ldr d6, [x17, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "ldr x11, [x17, #0x198]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "ldr x10, [x17, #0x1a8]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "mov v7.d[1], x11\n" + "ldr d6, [x17, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "ldr x11, [x17, #0x1b8]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x1b0]\n" + "fmla v10.8h, 
v6.8h, v0.h[6]\n" + "ldr x10, [x17, #0x1c8]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "mov v7.d[1], x11\n" + "ldr d6, [x17, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "ldr x11, [x17, #0x1d8]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "ldr x10, [x17, #0x1e8]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "mov v7.d[1], x11\n" + "ldr d6, [x17, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "ldr x11, [x17, #0x1f8]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x1f0]\n" + "add x17, x17, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "ldr x10, [x17, #0x8]\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "mov v7.d[1], x11\n" + "ldr d6, [x17, #0x0]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "ldr d0, [x12, #0x0]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "ldr d1, [x28, #0x0]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "mov v6.d[1], x10\n" + "mov v0.d[1], x9\n" + "ldr d2, [x26, #0x0]\n" + "mov v1.d[1], x27\n" + "mov v2.d[1], x25\n" + "bge 124b\n" + "125:" // Height 3: Multiply loop: Single iteration only + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x17, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "sub x13, x13, #0x8\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "ldr q6, [x17, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "add x12, x12, #0x10\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "ldr q7, [x17, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "add x28, x28, #0x10\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "ldr q6, [x17, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "add x26, x26, #0x10\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "ldr q7, [x17, 
#0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "ldr q6, [x17, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "ldr q7, [x17, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "ldr q6, [x17, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "ldr q7, [x17, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "ldr q6, [x17, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "ldr q7, [x17, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "ldr q6, [x17, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "ldr q7, [x17, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "ldr q6, [x17, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "ldr q7, [x17, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "ldr q6, [x17, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "ldr q7, [x17, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "ldr q6, [x17, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "ldr q7, [x17, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "ldr q6, [x17, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, 
v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "ldr q7, [x17, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "ldr q6, [x17, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "ldr q7, [x17, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "ldr q6, [x17, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "ldr q7, [x17, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "ldr q6, [x17, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "ldr q7, [x17, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "ldr q6, [x17, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "ldr q7, [x17, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "ldr q6, [x17, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "ldr q7, [x17, #0x1f0]\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "add x17, x17, #0x200\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "126:" // Height 3: Multiply loop: Main loop skip + "cbz x13, 128f\n" + "127:" // Height 3: Multiply loop: Odd block loop + "ldr h0, [x12], #0x2\n" + "sub x13, x13, #0x1\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr q6, [x17, #0x0]\n" + "ldr q7, [x17, #0x10]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "ldr q6, [x17, 
#0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "ldr q7, [x17, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "add x17, x17, #0x40\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "cbnz x13, 127b\n" + "128:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x14, x14, #0x1\n" + "cmp x14, x19\n" + "bne 121b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x15, #0x0]\n" + "add x25, x15, x19, LSL #1\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add x24, x25, x19, LSL #1\n" + "prfm pstl1keep, [x24, #0x0]\n" + "tbz %x[flags], #1, 129f\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v1.8h }, [x20]\n" + "ld1r { v0.8h }, [x19]\n" + "fmin v8.8h, v8.8h, v0.8h\n" + "fmin v9.8h, v9.8h, v0.8h\n" + "fmin v10.8h, v10.8h, v0.8h\n" + "fmin v11.8h, v11.8h, v0.8h\n" + "fmin v12.8h, v12.8h, v0.8h\n" + "fmin v13.8h, v13.8h, v0.8h\n" + "fmin v14.8h, v14.8h, v0.8h\n" + "fmin v15.8h, v15.8h, v0.8h\n" + "fmin v16.8h, v16.8h, v0.8h\n" + "fmin v17.8h, v17.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + "fmax v14.8h, v14.8h, v1.8h\n" + "fmax v15.8h, v15.8h, v1.8h\n" + "fmax v16.8h, v16.8h, v1.8h\n" + "fmax v17.8h, v17.8h, v1.8h\n" + "fmin v18.8h, v18.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v0.8h\n" + "fmax v18.8h, v18.8h, v1.8h\n" + "fmax v19.8h, v19.8h, v1.8h\n" + "129:" // Height 3: No activation + "cmp x8, #0x20\n" + "bge 146f\n" + "tbz x8, #4, 137f\n" + "st1 { v8.8h }, [x15], #0x10\n" + "st1 { v9.8h }, [x15], #0x10\n" + "st1 { v12.8h }, [x25], #0x10\n" + "st1 { v13.8h }, [x25], #0x10\n" + "st1 { v16.8h }, [x24], 
#0x10\n" + "st1 { v17.8h }, [x24], #0x10\n" + "tbz x8, #3, 133f\n" + "st1 { v10.8h }, [x15], #0x10\n" + "st1 { v14.8h }, [x25], #0x10\n" + "st1 { v18.8h }, [x24], #0x10\n" + "tbz x8, #2, 131f\n" + "str d11, [x15], #0x8\n" + "str d15, [x25], #0x8\n" + "str d19, [x24], #0x8\n" + "tbz x8, #1, 130f\n" + "st1 { v11.s }[2], [x15], #0x4\n" + "st1 { v15.s }[2], [x25], #0x4\n" + "st1 { v19.s }[2], [x24], #0x4\n" + "tbz x8, #0, 145f\n" + "st1 { v11.h }[6], [x15]\n" + "st1 { v15.h }[6], [x25]\n" + "st1 { v19.h }[6], [x24]\n" + "b 145f\n" + "130:" // Height 3: Partial direct writeback: partial_1_28 + "tbz x8, #0, 145f\n" + "st1 { v11.h }[4], [x15]\n" + "st1 { v15.h }[4], [x25]\n" + "st1 { v19.h }[4], [x24]\n" + "b 145f\n" + "131:" // Height 3: Partial direct writeback: partial_2_24 + "tbz x8, #1, 132f\n" + "str s11, [x15], #0x4\n" + "str s15, [x25], #0x4\n" + "str s19, [x24], #0x4\n" + "tbz x8, #0, 145f\n" + "st1 { v11.h }[2], [x15]\n" + "st1 { v15.h }[2], [x25]\n" + "st1 { v19.h }[2], [x24]\n" + "b 145f\n" + "132:" // Height 3: Partial direct writeback: partial_1_24 + "tbz x8, #0, 145f\n" + "str h11, [x15, #0x0]\n" + "str h15, [x25, #0x0]\n" + "str h19, [x24, #0x0]\n" + "b 145f\n" + "133:" // Height 3: Partial direct writeback: partial_4_16 + "tbz x8, #2, 135f\n" + "str d10, [x15], #0x8\n" + "str d14, [x25], #0x8\n" + "str d18, [x24], #0x8\n" + "tbz x8, #1, 134f\n" + "st1 { v10.s }[2], [x15], #0x4\n" + "st1 { v14.s }[2], [x25], #0x4\n" + "st1 { v18.s }[2], [x24], #0x4\n" + "tbz x8, #0, 145f\n" + "st1 { v10.h }[6], [x15]\n" + "st1 { v14.h }[6], [x25]\n" + "st1 { v18.h }[6], [x24]\n" + "b 145f\n" + "134:" // Height 3: Partial direct writeback: partial_1_20 + "tbz x8, #0, 145f\n" + "st1 { v10.h }[4], [x15]\n" + "st1 { v14.h }[4], [x25]\n" + "st1 { v18.h }[4], [x24]\n" + "b 145f\n" + "135:" // Height 3: Partial direct writeback: partial_2_16 + "tbz x8, #1, 136f\n" + "str s10, [x15], #0x4\n" + "str s14, [x25], #0x4\n" + "str s18, [x24], #0x4\n" + "tbz x8, #0, 145f\n" + "st1 { 
v10.h }[2], [x15]\n" + "st1 { v14.h }[2], [x25]\n" + "st1 { v18.h }[2], [x24]\n" + "b 145f\n" + "136:" // Height 3: Partial direct writeback: partial_1_16 + "tbz x8, #0, 145f\n" + "str h10, [x15, #0x0]\n" + "str h14, [x25, #0x0]\n" + "str h18, [x24, #0x0]\n" + "b 145f\n" + "137:" // Height 3: Partial direct writeback: partial_8_0 + "tbz x8, #3, 141f\n" + "st1 { v8.8h }, [x15], #0x10\n" + "st1 { v12.8h }, [x25], #0x10\n" + "st1 { v16.8h }, [x24], #0x10\n" + "tbz x8, #2, 139f\n" + "str d9, [x15], #0x8\n" + "str d13, [x25], #0x8\n" + "str d17, [x24], #0x8\n" + "tbz x8, #1, 138f\n" + "st1 { v9.s }[2], [x15], #0x4\n" + "st1 { v13.s }[2], [x25], #0x4\n" + "st1 { v17.s }[2], [x24], #0x4\n" + "tbz x8, #0, 145f\n" + "st1 { v9.h }[6], [x15]\n" + "st1 { v13.h }[6], [x25]\n" + "st1 { v17.h }[6], [x24]\n" + "b 145f\n" + "138:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x8, #0, 145f\n" + "st1 { v9.h }[4], [x15]\n" + "st1 { v13.h }[4], [x25]\n" + "st1 { v17.h }[4], [x24]\n" + "b 145f\n" + "139:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x8, #1, 140f\n" + "str s9, [x15], #0x4\n" + "str s13, [x25], #0x4\n" + "str s17, [x24], #0x4\n" + "tbz x8, #0, 145f\n" + "st1 { v9.h }[2], [x15]\n" + "st1 { v13.h }[2], [x25]\n" + "st1 { v17.h }[2], [x24]\n" + "b 145f\n" + "140:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x8, #0, 145f\n" + "str h9, [x15, #0x0]\n" + "str h13, [x25, #0x0]\n" + "str h17, [x24, #0x0]\n" + "b 145f\n" + "141:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x8, #2, 143f\n" + "str d8, [x15], #0x8\n" + "str d12, [x25], #0x8\n" + "str d16, [x24], #0x8\n" + "tbz x8, #1, 142f\n" + "st1 { v8.s }[2], [x15], #0x4\n" + "st1 { v12.s }[2], [x25], #0x4\n" + "st1 { v16.s }[2], [x24], #0x4\n" + "tbz x8, #0, 145f\n" + "st1 { v8.h }[6], [x15]\n" + "st1 { v12.h }[6], [x25]\n" + "st1 { v16.h }[6], [x24]\n" + "b 145f\n" + "142:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x8, #0, 145f\n" + "st1 { v8.h }[4], [x15]\n" + 
"st1 { v12.h }[4], [x25]\n" + "st1 { v16.h }[4], [x24]\n" + "b 145f\n" + "143:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x8, #1, 144f\n" + "str s8, [x15], #0x4\n" + "str s12, [x25], #0x4\n" + "str s16, [x24], #0x4\n" + "tbz x8, #0, 145f\n" + "st1 { v8.h }[2], [x15]\n" + "st1 { v12.h }[2], [x25]\n" + "st1 { v16.h }[2], [x24]\n" + "b 145f\n" + "144:" // Height 3: Partial direct writeback: partial_1_0 + "str h8, [x15, #0x0]\n" + "str h12, [x25, #0x0]\n" + "str h16, [x24, #0x0]\n" + "145:" // Height 3: Partial direct writeback: Done + "b 147f\n" + "146:" // Height 3: Full writeback + "str q8, [x15, #0x0]\n" + "str q9, [x15, #0x10]\n" + "str q10, [x15, #0x20]\n" + "str q11, [x15, #0x30]\n" + "add x15, x15, #0x40\n" + "str q12, [x25, #0x0]\n" + "str q13, [x25, #0x10]\n" + "str q14, [x25, #0x20]\n" + "str q15, [x25, #0x30]\n" + "str q16, [x24, #0x0]\n" + "str q17, [x24, #0x10]\n" + "str q18, [x24, #0x20]\n" + "str q19, [x24, #0x30]\n" + "147:" // Height 3: Writeback done + "subs x8, x8, #0x20\n" + "bgt 100b\n" + "b 296f\n" + "148:" // Height 4 + "ldr x8, [%x[args_ptr], %[offsetof_N]]\n" + "mov x16, %x[bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x15, %x[output_ptr]\n" + "149:" // Height 4: Column loop + "cbz x16, 150f\n" + "ldr q8, [x16, #0x0]\n" + "ldr q9, [x16, #0x10]\n" + "ldr q10, [x16, #0x20]\n" + "mov v12.16b, v8.16b\n" + "mov v16.16b, v8.16b\n" + "mov v13.16b, v9.16b\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v18.16b, v10.16b\n" + "mov v20.16b, v8.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "ldr q11, [x16, #0x30]\n" + "add x16, x16, #0x40\n" + "mov v15.16b, v11.16b\n" + "mov v19.16b, v11.16b\n" + "mov v23.16b, v11.16b\n" + "b 169f\n" + "150:" // Height 4: no bias + "tbz %x[flags], #0, 168f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x8, #0x20\n" + "add x25, x15, x19, LSL #1\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "bge 167f\n" + "tbz x8, #4, 
158f\n" + "ld1 { v8.8h }, [x15], #0x10\n" + "ld1 { v12.8h }, [x25], #0x10\n" + "ld1 { v16.8h }, [x24], #0x10\n" + "ld1 { v9.8h }, [x15], #0x10\n" + "ld1 { v13.8h }, [x25], #0x10\n" + "ld1 { v17.8h }, [x24], #0x10\n" + "ld1 { v20.8h }, [x23], #0x10\n" + "ld1 { v21.8h }, [x23], #0x10\n" + "tbz x8, #3, 154f\n" + "ld1 { v10.8h }, [x15], #0x10\n" + "ld1 { v14.8h }, [x25], #0x10\n" + "ld1 { v18.8h }, [x24], #0x10\n" + "ld1 { v22.8h }, [x23], #0x10\n" + "tbz x8, #2, 152f\n" + "ldr d11, [x15], #0x8\n" + "ldr d15, [x25], #0x8\n" + "ldr d19, [x24], #0x8\n" + "ldr d23, [x23], #0x8\n" + "tbz x8, #1, 151f\n" + "ld1 { v11.s }[2], [x15], #0x4\n" + "mov x19, #0x3c\n" + "ld1 { v15.s }[2], [x25], #0x4\n" + "ld1 { v19.s }[2], [x24], #0x4\n" + "ld1 { v23.s }[2], [x23], #0x4\n" + "tbz x8, #0, 166f\n" + "ld1 { v11.h }[6], [x15]\n" + "ld1 { v15.h }[6], [x25]\n" + "ld1 { v19.h }[6], [x24]\n" + "ld1 { v23.h }[6], [x23]\n" + "b 166f\n" + "151:" // Height 4: Partial accumulate: partial_1_28 + "mov x19, #0x38\n" + "tbz x8, #0, 166f\n" + "ld1 { v11.h }[4], [x15]\n" + "ld1 { v15.h }[4], [x25]\n" + "ld1 { v19.h }[4], [x24]\n" + "ld1 { v23.h }[4], [x23]\n" + "b 166f\n" + "152:" // Height 4: Partial accumulate: partial_2_24 + "tbz x8, #1, 153f\n" + "ldr s11, [x15], #0x4\n" + "ldr s15, [x25], #0x4\n" + "mov x19, #0x34\n" + "ldr s19, [x24], #0x4\n" + "ldr s23, [x23], #0x4\n" + "tbz x8, #0, 166f\n" + "ld1 { v11.h }[2], [x15]\n" + "ld1 { v15.h }[2], [x25]\n" + "ld1 { v19.h }[2], [x24]\n" + "ld1 { v23.h }[2], [x23]\n" + "b 166f\n" + "153:" // Height 4: Partial accumulate: partial_1_24 + "mov x19, #0x30\n" + "tbz x8, #0, 166f\n" + "ldr h11, [x15, #0x0]\n" + "ldr h15, [x25, #0x0]\n" + "ldr h19, [x24, #0x0]\n" + "ldr h23, [x23, #0x0]\n" + "b 166f\n" + "154:" // Height 4: Partial accumulate: partial_4_16 + "tbz x8, #2, 156f\n" + "ldr d10, [x15], #0x8\n" + "ldr d14, [x25], #0x8\n" + "ldr d18, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "tbz x8, #1, 155f\n" + "ld1 { v10.s }[2], [x15], #0x4\n" + "mov x19, 
#0x2c\n" + "ld1 { v14.s }[2], [x25], #0x4\n" + "ld1 { v18.s }[2], [x24], #0x4\n" + "ld1 { v22.s }[2], [x23], #0x4\n" + "tbz x8, #0, 166f\n" + "ld1 { v10.h }[6], [x15]\n" + "ld1 { v14.h }[6], [x25]\n" + "ld1 { v18.h }[6], [x24]\n" + "ld1 { v22.h }[6], [x23]\n" + "b 166f\n" + "155:" // Height 4: Partial accumulate: partial_1_20 + "mov x19, #0x28\n" + "tbz x8, #0, 166f\n" + "ld1 { v10.h }[4], [x15]\n" + "ld1 { v14.h }[4], [x25]\n" + "ld1 { v18.h }[4], [x24]\n" + "ld1 { v22.h }[4], [x23]\n" + "b 166f\n" + "156:" // Height 4: Partial accumulate: partial_2_16 + "tbz x8, #1, 157f\n" + "ldr s10, [x15], #0x4\n" + "ldr s14, [x25], #0x4\n" + "mov x19, #0x24\n" + "ldr s18, [x24], #0x4\n" + "ldr s22, [x23], #0x4\n" + "tbz x8, #0, 166f\n" + "ld1 { v10.h }[2], [x15]\n" + "ld1 { v14.h }[2], [x25]\n" + "ld1 { v18.h }[2], [x24]\n" + "ld1 { v22.h }[2], [x23]\n" + "b 166f\n" + "157:" // Height 4: Partial accumulate: partial_1_16 + "mov x19, #0x20\n" + "tbz x8, #0, 166f\n" + "ldr h10, [x15, #0x0]\n" + "ldr h14, [x25, #0x0]\n" + "ldr h18, [x24, #0x0]\n" + "ldr h22, [x23, #0x0]\n" + "b 166f\n" + "158:" // Height 4: Partial accumulate: partial_8_0 + "tbz x8, #3, 162f\n" + "ld1 { v8.8h }, [x15], #0x10\n" + "ld1 { v12.8h }, [x25], #0x10\n" + "ld1 { v16.8h }, [x24], #0x10\n" + "ld1 { v20.8h }, [x23], #0x10\n" + "tbz x8, #2, 160f\n" + "ldr d9, [x15], #0x8\n" + "ldr d13, [x25], #0x8\n" + "ldr d17, [x24], #0x8\n" + "ldr d21, [x23], #0x8\n" + "tbz x8, #1, 159f\n" + "ld1 { v9.s }[2], [x15], #0x4\n" + "mov x19, #0x1c\n" + "ld1 { v13.s }[2], [x25], #0x4\n" + "ld1 { v17.s }[2], [x24], #0x4\n" + "ld1 { v21.s }[2], [x23], #0x4\n" + "tbz x8, #0, 166f\n" + "ld1 { v9.h }[6], [x15]\n" + "ld1 { v13.h }[6], [x25]\n" + "ld1 { v17.h }[6], [x24]\n" + "ld1 { v21.h }[6], [x23]\n" + "b 166f\n" + "159:" // Height 4: Partial accumulate: partial_1_12 + "mov x19, #0x18\n" + "tbz x8, #0, 166f\n" + "ld1 { v9.h }[4], [x15]\n" + "ld1 { v13.h }[4], [x25]\n" + "ld1 { v17.h }[4], [x24]\n" + "ld1 { v21.h }[4], [x23]\n" + "b 
166f\n" + "160:" // Height 4: Partial accumulate: partial_2_8 + "tbz x8, #1, 161f\n" + "ldr s9, [x15], #0x4\n" + "ldr s13, [x25], #0x4\n" + "mov x19, #0x14\n" + "ldr s17, [x24], #0x4\n" + "ldr s21, [x23], #0x4\n" + "tbz x8, #0, 166f\n" + "ld1 { v9.h }[2], [x15]\n" + "ld1 { v13.h }[2], [x25]\n" + "ld1 { v17.h }[2], [x24]\n" + "ld1 { v21.h }[2], [x23]\n" + "b 166f\n" + "161:" // Height 4: Partial accumulate: partial_1_8 + "mov x19, #0x10\n" + "tbz x8, #0, 166f\n" + "ldr h9, [x15, #0x0]\n" + "ldr h13, [x25, #0x0]\n" + "ldr h17, [x24, #0x0]\n" + "ldr h21, [x23, #0x0]\n" + "b 166f\n" + "162:" // Height 4: Partial accumulate: partial_4_0 + "tbz x8, #2, 164f\n" + "ldr d8, [x15], #0x8\n" + "ldr d12, [x25], #0x8\n" + "ldr d16, [x24], #0x8\n" + "ldr d20, [x23], #0x8\n" + "tbz x8, #1, 163f\n" + "ld1 { v8.s }[2], [x15], #0x4\n" + "mov x19, #0xc\n" + "ld1 { v12.s }[2], [x25], #0x4\n" + "ld1 { v16.s }[2], [x24], #0x4\n" + "ld1 { v20.s }[2], [x23], #0x4\n" + "tbz x8, #0, 166f\n" + "ld1 { v8.h }[6], [x15]\n" + "ld1 { v12.h }[6], [x25]\n" + "ld1 { v16.h }[6], [x24]\n" + "ld1 { v20.h }[6], [x23]\n" + "b 166f\n" + "163:" // Height 4: Partial accumulate: partial_1_4 + "mov x19, #0x8\n" + "tbz x8, #0, 166f\n" + "ld1 { v8.h }[4], [x15]\n" + "ld1 { v12.h }[4], [x25]\n" + "ld1 { v16.h }[4], [x24]\n" + "ld1 { v20.h }[4], [x23]\n" + "b 166f\n" + "164:" // Height 4: Partial accumulate: partial_2_0 + "tbz x8, #1, 165f\n" + "ldr s8, [x15], #0x4\n" + "ldr s12, [x25], #0x4\n" + "mov x19, #0x4\n" + "ldr s16, [x24], #0x4\n" + "ldr s20, [x23], #0x4\n" + "tbz x8, #0, 166f\n" + "ld1 { v8.h }[2], [x15]\n" + "ld1 { v12.h }[2], [x25]\n" + "ld1 { v16.h }[2], [x24]\n" + "ld1 { v20.h }[2], [x23]\n" + "b 166f\n" + "165:" // Height 4: Partial accumulate: partial_1_0 + "ldr h8, [x15, #0x0]\n" + "mov x19, #0x0\n" + "ldr h12, [x25, #0x0]\n" + "ldr h16, [x24, #0x0]\n" + "ldr h20, [x23, #0x0]\n" + "166:" // Height 4: Partial accumulate: Done + "sub x15, x15, x19\n" + "b 169f\n" + "167:" // Height 4: full 
accumulate + "ldr q8, [x15, #0x0]\n" + "ldr q9, [x15, #0x10]\n" + "ldr q10, [x15, #0x20]\n" + "ldr q11, [x15, #0x30]\n" + "ldr q12, [x25, #0x0]\n" + "ldr q13, [x25, #0x10]\n" + "ldr q14, [x25, #0x20]\n" + "ldr q15, [x25, #0x30]\n" + "ldr q16, [x24, #0x0]\n" + "ldr q17, [x24, #0x10]\n" + "ldr q18, [x24, #0x20]\n" + "ldr q19, [x24, #0x30]\n" + "ldr q20, [x23, #0x0]\n" + "ldr q21, [x23, #0x10]\n" + "ldr q22, [x23, #0x20]\n" + "ldr q23, [x23, #0x30]\n" + "b 169f\n" + "168:" // Height 4: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "169:" // Height 4: setup done + "mov x14, #0x0\n" + "170:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w13, [x20, x14, LSL #0x2]\n" + "tbz %x[flags], #3, 171f\n" + "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x12, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "cbnz x14, 172f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x12, x12, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "b 172f\n" + "171:" // Height 4: setup direct input + "mov x12, %x[input_ptr]\n" + "add x28, x12, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "add x24, x26, x19, LSL #1\n" + "172:" // Height 4: input setup done + "cmp x13, #0x8\n" + "blt 175f\n" + "ldr q0, [x12, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "cmp x13, #0x10\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q6, [x17, #0x0]\n" + "blt 174f\n" + "173:" // 
Height 4: Multiply loop: Main loop head + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr d7, [x17, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "ldr x11, [x17, #0x18]\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "ldr x10, [x17, #0x28]\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "ldr d6, [x17, #0x20]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x38]\n" + "add x12, x12, #0x10\n" + "add x28, x28, #0x10\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "mov v6.d[1], x10\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "ldr d7, [x17, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "ldr x10, [x17, #0x48]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "ldr x9, [x12, #0x8]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "mov v7.d[1], x11\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "ldr d6, [x17, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "ldr x11, [x17, #0x58]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "ldr x27, [x28, #0x8]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "mov v6.d[1], x10\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "ldr d7, [x17, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "ldr x10, [x17, #0x68]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "add x26, x26, #0x10\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "mov v7.d[1], x11\n" + "fmla v20.8h, v6.8h, v3.h[1]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "ldr d6, [x17, #0x60]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "ldr x11, [x17, #0x78]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "ldr x25, [x26, #0x8]\n" + "fmla v21.8h, v7.8h, v3.h[1]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x70]\n" + "add x24, x24, #0x10\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "ldr x10, [x17, #0x88]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "mov v7.d[1], x11\n" + "fmla v22.8h, v6.8h, v3.h[1]\n" + "ldr d6, [x17, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "ldr x11, [x17, #0x98]\n" + "fmla v15.8h, v7.8h, 
v1.h[1]\n" + "ldr x23, [x24, #0x8]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "mov v6.d[1], x10\n" + "fmla v23.8h, v7.8h, v3.h[1]\n" + "ldr d7, [x17, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "ldr x10, [x17, #0xa8]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "sub x13, x13, #0x8\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "mov v7.d[1], x11\n" + "fmla v20.8h, v6.8h, v3.h[2]\n" + "ldr d6, [x17, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "ldr x11, [x17, #0xb8]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "cmp x13, #0x10\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "mov v6.d[1], x10\n" + "fmla v21.8h, v7.8h, v3.h[2]\n" + "ldr d7, [x17, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "ldr x10, [x17, #0xc8]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "mov v7.d[1], x11\n" + "fmla v22.8h, v6.8h, v3.h[2]\n" + "ldr d6, [x17, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "ldr x11, [x17, #0xd8]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "mov v6.d[1], x10\n" + "fmla v23.8h, v7.8h, v3.h[2]\n" + "ldr d7, [x17, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "ldr x10, [x17, #0xe8]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "mov v7.d[1], x11\n" + "fmla v20.8h, v6.8h, v3.h[3]\n" + "ldr d6, [x17, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "ldr x11, [x17, #0xf8]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "mov v6.d[1], x10\n" + "fmla v21.8h, v7.8h, v3.h[3]\n" + "ldr d7, [x17, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "ldr x10, [x17, #0x108]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "mov v7.d[1], x11\n" + "fmla v22.8h, v6.8h, v3.h[3]\n" + "ldr d6, [x17, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "ldr x11, [x17, #0x118]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "mov v6.d[1], x10\n" + "fmla v23.8h, v7.8h, v3.h[3]\n" + "ldr d7, [x17, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "ldr x10, [x17, #0x128]\n" + "fmla v12.8h, v6.8h, 
v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "mov v7.d[1], x11\n" + "fmla v20.8h, v6.8h, v3.h[4]\n" + "ldr d6, [x17, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "ldr x11, [x17, #0x138]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "mov v6.d[1], x10\n" + "fmla v21.8h, v7.8h, v3.h[4]\n" + "ldr d7, [x17, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "ldr x10, [x17, #0x148]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "mov v7.d[1], x11\n" + "fmla v22.8h, v6.8h, v3.h[4]\n" + "ldr d6, [x17, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "ldr x11, [x17, #0x158]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "mov v6.d[1], x10\n" + "fmla v23.8h, v7.8h, v3.h[4]\n" + "ldr d7, [x17, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "ldr x10, [x17, #0x168]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "mov v7.d[1], x11\n" + "fmla v20.8h, v6.8h, v3.h[5]\n" + "ldr d6, [x17, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "ldr x11, [x17, #0x178]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "mov v6.d[1], x10\n" + "fmla v21.8h, v7.8h, v3.h[5]\n" + "ldr d7, [x17, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "ldr x10, [x17, #0x188]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "mov v7.d[1], x11\n" + "fmla v22.8h, v6.8h, v3.h[5]\n" + "ldr d6, [x17, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "ldr x11, [x17, #0x198]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "mov v6.d[1], x10\n" + "fmla v23.8h, v7.8h, v3.h[5]\n" + "ldr d7, [x17, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "ldr x10, [x17, #0x1a8]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "mov v7.d[1], x11\n" + "fmla v20.8h, v6.8h, v3.h[6]\n" + "ldr d6, [x17, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "ldr x11, [x17, #0x1b8]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "mov v6.d[1], 
x10\n" + "fmla v21.8h, v7.8h, v3.h[6]\n" + "ldr d7, [x17, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "ldr x10, [x17, #0x1c8]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "mov v7.d[1], x11\n" + "fmla v22.8h, v6.8h, v3.h[6]\n" + "ldr d6, [x17, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "ldr x11, [x17, #0x1d8]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "mov v6.d[1], x10\n" + "fmla v23.8h, v7.8h, v3.h[6]\n" + "ldr d7, [x17, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "ldr x10, [x17, #0x1e8]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "mov v7.d[1], x11\n" + "fmla v20.8h, v6.8h, v3.h[7]\n" + "ldr d6, [x17, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "ldr x11, [x17, #0x1f8]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "mov v6.d[1], x10\n" + "fmla v21.8h, v7.8h, v3.h[7]\n" + "ldr d7, [x17, #0x1f0]\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "add x17, x17, #0x200\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "ldr x10, [x17, #0x8]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "mov v7.d[1], x11\n" + "fmla v22.8h, v6.8h, v3.h[7]\n" + "ldr d6, [x17, #0x0]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "ldr d0, [x12, #0x0]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "ldr d1, [x28, #0x0]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "mov v6.d[1], x10\n" + "fmla v23.8h, v7.8h, v3.h[7]\n" + "mov v0.d[1], x9\n" + "mov v1.d[1], x27\n" + "ldr d2, [x26, #0x0]\n" + "ldr d3, [x24, #0x0]\n" + "mov v2.d[1], x25\n" + "mov v3.d[1], x23\n" + "bge 173b\n" + "174:" // Height 4: Multiply loop: Single iteration only + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x17, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "sub x13, x13, #0x8\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "add x12, x12, #0x10\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "ldr q6, [x17, #0x20]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "add x28, x28, #0x10\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" 
+ "prfm pldl1keep, [x28, #0x80]\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "ldr q7, [x17, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "add x26, x26, #0x10\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "add x24, x24, #0x10\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "ldr q6, [x17, #0x40]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "ldr q7, [x17, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "fmla v20.8h, v6.8h, v3.h[1]\n" + "ldr q6, [x17, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "fmla v21.8h, v7.8h, v3.h[1]\n" + "ldr q7, [x17, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "fmla v22.8h, v6.8h, v3.h[1]\n" + "ldr q6, [x17, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "fmla v23.8h, v7.8h, v3.h[1]\n" + "ldr q7, [x17, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "fmla v20.8h, v6.8h, v3.h[2]\n" + "ldr q6, [x17, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "fmla v21.8h, v7.8h, v3.h[2]\n" + "ldr q7, [x17, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "fmla v22.8h, v6.8h, v3.h[2]\n" + "ldr q6, [x17, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "fmla v23.8h, v7.8h, v3.h[2]\n" + "ldr q7, [x17, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "fmla v20.8h, v6.8h, v3.h[3]\n" + "ldr q6, [x17, 
#0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "fmla v21.8h, v7.8h, v3.h[3]\n" + "ldr q7, [x17, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "fmla v22.8h, v6.8h, v3.h[3]\n" + "ldr q6, [x17, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "fmla v23.8h, v7.8h, v3.h[3]\n" + "ldr q7, [x17, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "fmla v20.8h, v6.8h, v3.h[4]\n" + "ldr q6, [x17, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "fmla v21.8h, v7.8h, v3.h[4]\n" + "ldr q7, [x17, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "fmla v22.8h, v6.8h, v3.h[4]\n" + "ldr q6, [x17, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "fmla v23.8h, v7.8h, v3.h[4]\n" + "ldr q7, [x17, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "fmla v20.8h, v6.8h, v3.h[5]\n" + "ldr q6, [x17, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "fmla v21.8h, v7.8h, v3.h[5]\n" + "ldr q7, [x17, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "fmla v22.8h, v6.8h, v3.h[5]\n" + "ldr q6, [x17, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "fmla v23.8h, v7.8h, v3.h[5]\n" + "ldr q7, [x17, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "fmla v20.8h, v6.8h, v3.h[6]\n" + "ldr q6, [x17, #0x1a0]\n" + "fmla v9.8h, v7.8h, 
v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "fmla v21.8h, v7.8h, v3.h[6]\n" + "ldr q7, [x17, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "fmla v22.8h, v6.8h, v3.h[6]\n" + "ldr q6, [x17, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "fmla v23.8h, v7.8h, v3.h[6]\n" + "ldr q7, [x17, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "fmla v20.8h, v6.8h, v3.h[7]\n" + "ldr q6, [x17, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "fmla v21.8h, v7.8h, v3.h[7]\n" + "ldr q7, [x17, #0x1f0]\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "add x17, x17, #0x200\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v22.8h, v6.8h, v3.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "fmla v23.8h, v7.8h, v3.h[7]\n" + "175:" // Height 4: Multiply loop: Main loop skip + "cbz x13, 177f\n" + "176:" // Height 4: Multiply loop: Odd block loop + "ldr h0, [x12], #0x2\n" + "sub x13, x13, #0x1\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr h3, [x24], #0x2\n" + "ldr q6, [x17, #0x0]\n" + "ldr q7, [x17, #0x10]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "ldr q6, [x17, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "ldr q7, [x17, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "add x17, x17, #0x40\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla 
v23.8h, v7.8h, v3.h[0]\n" + "cbnz x13, 176b\n" + "177:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x14, x14, #0x1\n" + "cmp x14, x19\n" + "bne 170b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x15, #0x0]\n" + "add x25, x15, x19, LSL #1\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add x24, x25, x19, LSL #1\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #1\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbz %x[flags], #1, 178f\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v1.8h }, [x20]\n" + "ld1r { v0.8h }, [x19]\n" + "fmin v8.8h, v8.8h, v0.8h\n" + "fmin v9.8h, v9.8h, v0.8h\n" + "fmin v10.8h, v10.8h, v0.8h\n" + "fmin v11.8h, v11.8h, v0.8h\n" + "fmin v12.8h, v12.8h, v0.8h\n" + "fmin v13.8h, v13.8h, v0.8h\n" + "fmin v14.8h, v14.8h, v0.8h\n" + "fmin v15.8h, v15.8h, v0.8h\n" + "fmin v16.8h, v16.8h, v0.8h\n" + "fmin v17.8h, v17.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + "fmax v14.8h, v14.8h, v1.8h\n" + "fmax v15.8h, v15.8h, v1.8h\n" + "fmax v16.8h, v16.8h, v1.8h\n" + "fmax v17.8h, v17.8h, v1.8h\n" + "fmin v18.8h, v18.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v0.8h\n" + "fmin v20.8h, v20.8h, v0.8h\n" + "fmin v21.8h, v21.8h, v0.8h\n" + "fmin v22.8h, v22.8h, v0.8h\n" + "fmin v23.8h, v23.8h, v0.8h\n" + "fmax v18.8h, v18.8h, v1.8h\n" + "fmax v19.8h, v19.8h, v1.8h\n" + "fmax v20.8h, v20.8h, v1.8h\n" + "fmax v21.8h, v21.8h, v1.8h\n" + "fmax v22.8h, v22.8h, v1.8h\n" + "fmax v23.8h, v23.8h, v1.8h\n" + "178:" // Height 4: No activation + "cmp x8, #0x20\n" + "bge 195f\n" + "tbz x8, #4, 186f\n" + "st1 { v8.8h }, [x15], #0x10\n" + "st1 { v9.8h }, [x15], #0x10\n" + "st1 { v12.8h }, [x25], #0x10\n" + "st1 { v13.8h }, [x25], #0x10\n" + "st1 { v16.8h }, [x24], #0x10\n" + "st1 
{ v17.8h }, [x24], #0x10\n" + "st1 { v20.8h }, [x23], #0x10\n" + "st1 { v21.8h }, [x23], #0x10\n" + "tbz x8, #3, 182f\n" + "st1 { v10.8h }, [x15], #0x10\n" + "st1 { v14.8h }, [x25], #0x10\n" + "st1 { v18.8h }, [x24], #0x10\n" + "st1 { v22.8h }, [x23], #0x10\n" + "tbz x8, #2, 180f\n" + "str d11, [x15], #0x8\n" + "str d15, [x25], #0x8\n" + "str d19, [x24], #0x8\n" + "str d23, [x23], #0x8\n" + "tbz x8, #1, 179f\n" + "st1 { v11.s }[2], [x15], #0x4\n" + "st1 { v15.s }[2], [x25], #0x4\n" + "st1 { v19.s }[2], [x24], #0x4\n" + "st1 { v23.s }[2], [x23], #0x4\n" + "tbz x8, #0, 194f\n" + "st1 { v11.h }[6], [x15]\n" + "st1 { v15.h }[6], [x25]\n" + "st1 { v19.h }[6], [x24]\n" + "st1 { v23.h }[6], [x23]\n" + "b 194f\n" + "179:" // Height 4: Partial direct writeback: partial_1_28 + "tbz x8, #0, 194f\n" + "st1 { v11.h }[4], [x15]\n" + "st1 { v15.h }[4], [x25]\n" + "st1 { v19.h }[4], [x24]\n" + "st1 { v23.h }[4], [x23]\n" + "b 194f\n" + "180:" // Height 4: Partial direct writeback: partial_2_24 + "tbz x8, #1, 181f\n" + "str s11, [x15], #0x4\n" + "str s15, [x25], #0x4\n" + "str s19, [x24], #0x4\n" + "str s23, [x23], #0x4\n" + "tbz x8, #0, 194f\n" + "st1 { v11.h }[2], [x15]\n" + "st1 { v15.h }[2], [x25]\n" + "st1 { v19.h }[2], [x24]\n" + "st1 { v23.h }[2], [x23]\n" + "b 194f\n" + "181:" // Height 4: Partial direct writeback: partial_1_24 + "tbz x8, #0, 194f\n" + "str h11, [x15, #0x0]\n" + "str h15, [x25, #0x0]\n" + "str h19, [x24, #0x0]\n" + "str h23, [x23, #0x0]\n" + "b 194f\n" + "182:" // Height 4: Partial direct writeback: partial_4_16 + "tbz x8, #2, 184f\n" + "str d10, [x15], #0x8\n" + "str d14, [x25], #0x8\n" + "str d18, [x24], #0x8\n" + "str d22, [x23], #0x8\n" + "tbz x8, #1, 183f\n" + "st1 { v10.s }[2], [x15], #0x4\n" + "st1 { v14.s }[2], [x25], #0x4\n" + "st1 { v18.s }[2], [x24], #0x4\n" + "st1 { v22.s }[2], [x23], #0x4\n" + "tbz x8, #0, 194f\n" + "st1 { v10.h }[6], [x15]\n" + "st1 { v14.h }[6], [x25]\n" + "st1 { v18.h }[6], [x24]\n" + "st1 { v22.h }[6], [x23]\n" + "b 194f\n" 
+ "183:" // Height 4: Partial direct writeback: partial_1_20 + "tbz x8, #0, 194f\n" + "st1 { v10.h }[4], [x15]\n" + "st1 { v14.h }[4], [x25]\n" + "st1 { v18.h }[4], [x24]\n" + "st1 { v22.h }[4], [x23]\n" + "b 194f\n" + "184:" // Height 4: Partial direct writeback: partial_2_16 + "tbz x8, #1, 185f\n" + "str s10, [x15], #0x4\n" + "str s14, [x25], #0x4\n" + "str s18, [x24], #0x4\n" + "str s22, [x23], #0x4\n" + "tbz x8, #0, 194f\n" + "st1 { v10.h }[2], [x15]\n" + "st1 { v14.h }[2], [x25]\n" + "st1 { v18.h }[2], [x24]\n" + "st1 { v22.h }[2], [x23]\n" + "b 194f\n" + "185:" // Height 4: Partial direct writeback: partial_1_16 + "tbz x8, #0, 194f\n" + "str h10, [x15, #0x0]\n" + "str h14, [x25, #0x0]\n" + "str h18, [x24, #0x0]\n" + "str h22, [x23, #0x0]\n" + "b 194f\n" + "186:" // Height 4: Partial direct writeback: partial_8_0 + "tbz x8, #3, 190f\n" + "st1 { v8.8h }, [x15], #0x10\n" + "st1 { v12.8h }, [x25], #0x10\n" + "st1 { v16.8h }, [x24], #0x10\n" + "st1 { v20.8h }, [x23], #0x10\n" + "tbz x8, #2, 188f\n" + "str d9, [x15], #0x8\n" + "str d13, [x25], #0x8\n" + "str d17, [x24], #0x8\n" + "str d21, [x23], #0x8\n" + "tbz x8, #1, 187f\n" + "st1 { v9.s }[2], [x15], #0x4\n" + "st1 { v13.s }[2], [x25], #0x4\n" + "st1 { v17.s }[2], [x24], #0x4\n" + "st1 { v21.s }[2], [x23], #0x4\n" + "tbz x8, #0, 194f\n" + "st1 { v9.h }[6], [x15]\n" + "st1 { v13.h }[6], [x25]\n" + "st1 { v17.h }[6], [x24]\n" + "st1 { v21.h }[6], [x23]\n" + "b 194f\n" + "187:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x8, #0, 194f\n" + "st1 { v9.h }[4], [x15]\n" + "st1 { v13.h }[4], [x25]\n" + "st1 { v17.h }[4], [x24]\n" + "st1 { v21.h }[4], [x23]\n" + "b 194f\n" + "188:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x8, #1, 189f\n" + "str s9, [x15], #0x4\n" + "str s13, [x25], #0x4\n" + "str s17, [x24], #0x4\n" + "str s21, [x23], #0x4\n" + "tbz x8, #0, 194f\n" + "st1 { v9.h }[2], [x15]\n" + "st1 { v13.h }[2], [x25]\n" + "st1 { v17.h }[2], [x24]\n" + "st1 { v21.h }[2], [x23]\n" + "b 
194f\n" + "189:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x8, #0, 194f\n" + "str h9, [x15, #0x0]\n" + "str h13, [x25, #0x0]\n" + "str h17, [x24, #0x0]\n" + "str h21, [x23, #0x0]\n" + "b 194f\n" + "190:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x8, #2, 192f\n" + "str d8, [x15], #0x8\n" + "str d12, [x25], #0x8\n" + "str d16, [x24], #0x8\n" + "str d20, [x23], #0x8\n" + "tbz x8, #1, 191f\n" + "st1 { v8.s }[2], [x15], #0x4\n" + "st1 { v12.s }[2], [x25], #0x4\n" + "st1 { v16.s }[2], [x24], #0x4\n" + "st1 { v20.s }[2], [x23], #0x4\n" + "tbz x8, #0, 194f\n" + "st1 { v8.h }[6], [x15]\n" + "st1 { v12.h }[6], [x25]\n" + "st1 { v16.h }[6], [x24]\n" + "st1 { v20.h }[6], [x23]\n" + "b 194f\n" + "191:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x8, #0, 194f\n" + "st1 { v8.h }[4], [x15]\n" + "st1 { v12.h }[4], [x25]\n" + "st1 { v16.h }[4], [x24]\n" + "st1 { v20.h }[4], [x23]\n" + "b 194f\n" + "192:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x8, #1, 193f\n" + "str s8, [x15], #0x4\n" + "str s12, [x25], #0x4\n" + "str s16, [x24], #0x4\n" + "str s20, [x23], #0x4\n" + "tbz x8, #0, 194f\n" + "st1 { v8.h }[2], [x15]\n" + "st1 { v12.h }[2], [x25]\n" + "st1 { v16.h }[2], [x24]\n" + "st1 { v20.h }[2], [x23]\n" + "b 194f\n" + "193:" // Height 4: Partial direct writeback: partial_1_0 + "str h8, [x15, #0x0]\n" + "str h12, [x25, #0x0]\n" + "str h16, [x24, #0x0]\n" + "str h20, [x23, #0x0]\n" + "194:" // Height 4: Partial direct writeback: Done + "b 196f\n" + "195:" // Height 4: Full writeback + "str q8, [x15, #0x0]\n" + "str q9, [x15, #0x10]\n" + "str q10, [x15, #0x20]\n" + "str q11, [x15, #0x30]\n" + "add x15, x15, #0x40\n" + "str q12, [x25, #0x0]\n" + "str q13, [x25, #0x10]\n" + "str q14, [x25, #0x20]\n" + "str q15, [x25, #0x30]\n" + "str q16, [x24, #0x0]\n" + "str q17, [x24, #0x10]\n" + "str q18, [x24, #0x20]\n" + "str q19, [x24, #0x30]\n" + "str q20, [x23, #0x0]\n" + "str q21, [x23, #0x10]\n" + "str q22, [x23, #0x20]\n" + "str 
q23, [x23, #0x30]\n" + "196:" // Height 4: Writeback done + "subs x8, x8, #0x20\n" + "bgt 149b\n" + "b 296f\n" + "197:" // Height 5 + "ldr x8, [%x[args_ptr], %[offsetof_N]]\n" + "mov x16, %x[bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x15, %x[output_ptr]\n" + "198:" // Height 5: Column loop + "cbz x16, 199f\n" + "ldr q8, [x16, #0x0]\n" + "ldr q9, [x16, #0x10]\n" + "ldr q10, [x16, #0x20]\n" + "mov v12.16b, v8.16b\n" + "mov v16.16b, v8.16b\n" + "mov v13.16b, v9.16b\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v18.16b, v10.16b\n" + "mov v20.16b, v8.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "mov v24.16b, v8.16b\n" + "mov v25.16b, v9.16b\n" + "mov v26.16b, v10.16b\n" + "ldr q11, [x16, #0x30]\n" + "add x16, x16, #0x40\n" + "mov v15.16b, v11.16b\n" + "mov v19.16b, v11.16b\n" + "mov v23.16b, v11.16b\n" + "mov v27.16b, v11.16b\n" + "b 218f\n" + "199:" // Height 5: no bias + "tbz %x[flags], #0, 217f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x8, #0x20\n" + "add x25, x15, x19, LSL #1\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "bge 216f\n" + "tbz x8, #4, 207f\n" + "ld1 { v8.8h }, [x15], #0x10\n" + "ld1 { v12.8h }, [x25], #0x10\n" + "ld1 { v16.8h }, [x24], #0x10\n" + "ld1 { v9.8h }, [x15], #0x10\n" + "ld1 { v13.8h }, [x25], #0x10\n" + "ld1 { v17.8h }, [x24], #0x10\n" + "ld1 { v20.8h }, [x23], #0x10\n" + "ld1 { v24.8h }, [x22], #0x10\n" + "ld1 { v21.8h }, [x23], #0x10\n" + "ld1 { v25.8h }, [x22], #0x10\n" + "tbz x8, #3, 203f\n" + "ld1 { v10.8h }, [x15], #0x10\n" + "ld1 { v14.8h }, [x25], #0x10\n" + "ld1 { v18.8h }, [x24], #0x10\n" + "ld1 { v22.8h }, [x23], #0x10\n" + "ld1 { v26.8h }, [x22], #0x10\n" + "tbz x8, #2, 201f\n" + "ldr d11, [x15], #0x8\n" + "ldr d15, [x25], #0x8\n" + "ldr d19, [x24], #0x8\n" + "ldr d23, [x23], #0x8\n" + "ldr d27, [x22], #0x8\n" + "tbz x8, #1, 200f\n" + "ld1 { v11.s }[2], [x15], #0x4\n" + "mov x19, #0x3c\n" + "ld1 
{ v15.s }[2], [x25], #0x4\n" + "ld1 { v19.s }[2], [x24], #0x4\n" + "ld1 { v23.s }[2], [x23], #0x4\n" + "ld1 { v27.s }[2], [x22], #0x4\n" + "tbz x8, #0, 215f\n" + "ld1 { v11.h }[6], [x15]\n" + "ld1 { v15.h }[6], [x25]\n" + "ld1 { v19.h }[6], [x24]\n" + "ld1 { v23.h }[6], [x23]\n" + "ld1 { v27.h }[6], [x22]\n" + "b 215f\n" + "200:" // Height 5: Partial accumulate: partial_1_28 + "mov x19, #0x38\n" + "tbz x8, #0, 215f\n" + "ld1 { v11.h }[4], [x15]\n" + "ld1 { v15.h }[4], [x25]\n" + "ld1 { v19.h }[4], [x24]\n" + "ld1 { v23.h }[4], [x23]\n" + "ld1 { v27.h }[4], [x22]\n" + "b 215f\n" + "201:" // Height 5: Partial accumulate: partial_2_24 + "tbz x8, #1, 202f\n" + "ldr s11, [x15], #0x4\n" + "ldr s15, [x25], #0x4\n" + "mov x19, #0x34\n" + "ldr s19, [x24], #0x4\n" + "ldr s23, [x23], #0x4\n" + "ldr s27, [x22], #0x4\n" + "tbz x8, #0, 215f\n" + "ld1 { v11.h }[2], [x15]\n" + "ld1 { v15.h }[2], [x25]\n" + "ld1 { v19.h }[2], [x24]\n" + "ld1 { v23.h }[2], [x23]\n" + "ld1 { v27.h }[2], [x22]\n" + "b 215f\n" + "202:" // Height 5: Partial accumulate: partial_1_24 + "mov x19, #0x30\n" + "tbz x8, #0, 215f\n" + "ldr h11, [x15, #0x0]\n" + "ldr h15, [x25, #0x0]\n" + "ldr h19, [x24, #0x0]\n" + "ldr h23, [x23, #0x0]\n" + "ldr h27, [x22, #0x0]\n" + "b 215f\n" + "203:" // Height 5: Partial accumulate: partial_4_16 + "tbz x8, #2, 205f\n" + "ldr d10, [x15], #0x8\n" + "ldr d14, [x25], #0x8\n" + "ldr d18, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "ldr d26, [x22], #0x8\n" + "tbz x8, #1, 204f\n" + "ld1 { v10.s }[2], [x15], #0x4\n" + "mov x19, #0x2c\n" + "ld1 { v14.s }[2], [x25], #0x4\n" + "ld1 { v18.s }[2], [x24], #0x4\n" + "ld1 { v22.s }[2], [x23], #0x4\n" + "ld1 { v26.s }[2], [x22], #0x4\n" + "tbz x8, #0, 215f\n" + "ld1 { v10.h }[6], [x15]\n" + "ld1 { v14.h }[6], [x25]\n" + "ld1 { v18.h }[6], [x24]\n" + "ld1 { v22.h }[6], [x23]\n" + "ld1 { v26.h }[6], [x22]\n" + "b 215f\n" + "204:" // Height 5: Partial accumulate: partial_1_20 + "mov x19, #0x28\n" + "tbz x8, #0, 215f\n" + "ld1 { v10.h }[4], 
[x15]\n" + "ld1 { v14.h }[4], [x25]\n" + "ld1 { v18.h }[4], [x24]\n" + "ld1 { v22.h }[4], [x23]\n" + "ld1 { v26.h }[4], [x22]\n" + "b 215f\n" + "205:" // Height 5: Partial accumulate: partial_2_16 + "tbz x8, #1, 206f\n" + "ldr s10, [x15], #0x4\n" + "ldr s14, [x25], #0x4\n" + "mov x19, #0x24\n" + "ldr s18, [x24], #0x4\n" + "ldr s22, [x23], #0x4\n" + "ldr s26, [x22], #0x4\n" + "tbz x8, #0, 215f\n" + "ld1 { v10.h }[2], [x15]\n" + "ld1 { v14.h }[2], [x25]\n" + "ld1 { v18.h }[2], [x24]\n" + "ld1 { v22.h }[2], [x23]\n" + "ld1 { v26.h }[2], [x22]\n" + "b 215f\n" + "206:" // Height 5: Partial accumulate: partial_1_16 + "mov x19, #0x20\n" + "tbz x8, #0, 215f\n" + "ldr h10, [x15, #0x0]\n" + "ldr h14, [x25, #0x0]\n" + "ldr h18, [x24, #0x0]\n" + "ldr h22, [x23, #0x0]\n" + "ldr h26, [x22, #0x0]\n" + "b 215f\n" + "207:" // Height 5: Partial accumulate: partial_8_0 + "tbz x8, #3, 211f\n" + "ld1 { v8.8h }, [x15], #0x10\n" + "ld1 { v12.8h }, [x25], #0x10\n" + "ld1 { v16.8h }, [x24], #0x10\n" + "ld1 { v20.8h }, [x23], #0x10\n" + "ld1 { v24.8h }, [x22], #0x10\n" + "tbz x8, #2, 209f\n" + "ldr d9, [x15], #0x8\n" + "ldr d13, [x25], #0x8\n" + "ldr d17, [x24], #0x8\n" + "ldr d21, [x23], #0x8\n" + "ldr d25, [x22], #0x8\n" + "tbz x8, #1, 208f\n" + "ld1 { v9.s }[2], [x15], #0x4\n" + "mov x19, #0x1c\n" + "ld1 { v13.s }[2], [x25], #0x4\n" + "ld1 { v17.s }[2], [x24], #0x4\n" + "ld1 { v21.s }[2], [x23], #0x4\n" + "ld1 { v25.s }[2], [x22], #0x4\n" + "tbz x8, #0, 215f\n" + "ld1 { v9.h }[6], [x15]\n" + "ld1 { v13.h }[6], [x25]\n" + "ld1 { v17.h }[6], [x24]\n" + "ld1 { v21.h }[6], [x23]\n" + "ld1 { v25.h }[6], [x22]\n" + "b 215f\n" + "208:" // Height 5: Partial accumulate: partial_1_12 + "mov x19, #0x18\n" + "tbz x8, #0, 215f\n" + "ld1 { v9.h }[4], [x15]\n" + "ld1 { v13.h }[4], [x25]\n" + "ld1 { v17.h }[4], [x24]\n" + "ld1 { v21.h }[4], [x23]\n" + "ld1 { v25.h }[4], [x22]\n" + "b 215f\n" + "209:" // Height 5: Partial accumulate: partial_2_8 + "tbz x8, #1, 210f\n" + "ldr s9, [x15], #0x4\n" + "ldr 
s13, [x25], #0x4\n" + "mov x19, #0x14\n" + "ldr s17, [x24], #0x4\n" + "ldr s21, [x23], #0x4\n" + "ldr s25, [x22], #0x4\n" + "tbz x8, #0, 215f\n" + "ld1 { v9.h }[2], [x15]\n" + "ld1 { v13.h }[2], [x25]\n" + "ld1 { v17.h }[2], [x24]\n" + "ld1 { v21.h }[2], [x23]\n" + "ld1 { v25.h }[2], [x22]\n" + "b 215f\n" + "210:" // Height 5: Partial accumulate: partial_1_8 + "mov x19, #0x10\n" + "tbz x8, #0, 215f\n" + "ldr h9, [x15, #0x0]\n" + "ldr h13, [x25, #0x0]\n" + "ldr h17, [x24, #0x0]\n" + "ldr h21, [x23, #0x0]\n" + "ldr h25, [x22, #0x0]\n" + "b 215f\n" + "211:" // Height 5: Partial accumulate: partial_4_0 + "tbz x8, #2, 213f\n" + "ldr d8, [x15], #0x8\n" + "ldr d12, [x25], #0x8\n" + "ldr d16, [x24], #0x8\n" + "ldr d20, [x23], #0x8\n" + "ldr d24, [x22], #0x8\n" + "tbz x8, #1, 212f\n" + "ld1 { v8.s }[2], [x15], #0x4\n" + "mov x19, #0xc\n" + "ld1 { v12.s }[2], [x25], #0x4\n" + "ld1 { v16.s }[2], [x24], #0x4\n" + "ld1 { v20.s }[2], [x23], #0x4\n" + "ld1 { v24.s }[2], [x22], #0x4\n" + "tbz x8, #0, 215f\n" + "ld1 { v8.h }[6], [x15]\n" + "ld1 { v12.h }[6], [x25]\n" + "ld1 { v16.h }[6], [x24]\n" + "ld1 { v20.h }[6], [x23]\n" + "ld1 { v24.h }[6], [x22]\n" + "b 215f\n" + "212:" // Height 5: Partial accumulate: partial_1_4 + "mov x19, #0x8\n" + "tbz x8, #0, 215f\n" + "ld1 { v8.h }[4], [x15]\n" + "ld1 { v12.h }[4], [x25]\n" + "ld1 { v16.h }[4], [x24]\n" + "ld1 { v20.h }[4], [x23]\n" + "ld1 { v24.h }[4], [x22]\n" + "b 215f\n" + "213:" // Height 5: Partial accumulate: partial_2_0 + "tbz x8, #1, 214f\n" + "ldr s8, [x15], #0x4\n" + "ldr s12, [x25], #0x4\n" + "mov x19, #0x4\n" + "ldr s16, [x24], #0x4\n" + "ldr s20, [x23], #0x4\n" + "ldr s24, [x22], #0x4\n" + "tbz x8, #0, 215f\n" + "ld1 { v8.h }[2], [x15]\n" + "ld1 { v12.h }[2], [x25]\n" + "ld1 { v16.h }[2], [x24]\n" + "ld1 { v20.h }[2], [x23]\n" + "ld1 { v24.h }[2], [x22]\n" + "b 215f\n" + "214:" // Height 5: Partial accumulate: partial_1_0 + "ldr h8, [x15, #0x0]\n" + "mov x19, #0x0\n" + "ldr h12, [x25, #0x0]\n" + "ldr h16, [x24, #0x0]\n" 
+ "ldr h20, [x23, #0x0]\n" + "ldr h24, [x22, #0x0]\n" + "215:" // Height 5: Partial accumulate: Done + "sub x15, x15, x19\n" + "b 218f\n" + "216:" // Height 5: full accumulate + "ldr q8, [x15, #0x0]\n" + "ldr q9, [x15, #0x10]\n" + "ldr q10, [x15, #0x20]\n" + "ldr q11, [x15, #0x30]\n" + "ldr q12, [x25, #0x0]\n" + "ldr q13, [x25, #0x10]\n" + "ldr q14, [x25, #0x20]\n" + "ldr q15, [x25, #0x30]\n" + "ldr q16, [x24, #0x0]\n" + "ldr q17, [x24, #0x10]\n" + "ldr q18, [x24, #0x20]\n" + "ldr q19, [x24, #0x30]\n" + "ldr q20, [x23, #0x0]\n" + "ldr q21, [x23, #0x10]\n" + "ldr q22, [x23, #0x20]\n" + "ldr q23, [x23, #0x30]\n" + "ldr q24, [x22, #0x0]\n" + "ldr q25, [x22, #0x10]\n" + "ldr q26, [x22, #0x20]\n" + "ldr q27, [x22, #0x30]\n" + "b 218f\n" + "217:" // Height 5: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "218:" // Height 5: setup done + "mov x14, #0x0\n" + "219:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w13, [x20, x14, LSL #0x2]\n" + "tbz %x[flags], #3, 220f\n" + "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x12, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x14, 221f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x12, x12, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x22, x22, x19, LSL 
#1\n" + "b 221f\n" + "220:" // Height 5: setup direct input + "mov x12, %x[input_ptr]\n" + "add x28, x12, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "add x24, x26, x19, LSL #1\n" + "add x22, x24, x19, LSL #1\n" + "221:" // Height 5: input setup done + "cmp x13, #0x8\n" + "blt 224f\n" + "ldr q0, [x12, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "cmp x13, #0x10\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x17, #0x0]\n" + "blt 223f\n" + "222:" // Height 5: Multiply loop: Main loop head + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr d7, [x17, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "ldr x11, [x17, #0x18]\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "ldr x10, [x17, #0x28]\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "add x12, x12, #0x10\n" + "fmla v24.8h, v6.8h, v4.h[0]\n" + "mov v7.d[1], x11\n" + "prfm pldl1keep, [x12, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "ldr d6, [x17, #0x20]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "ldr x11, [x17, #0x38]\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "ldr x9, [x12, #0x8]\n" + "fmla v25.8h, v7.8h, v4.h[0]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x30]\n" + "add x26, x26, #0x10\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "ldr x10, [x17, #0x48]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "mov v7.d[1], x11\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "ldr x11, [x17, #0x58]\n" + "fmla v26.8h, v6.8h, v4.h[0]\n" + "ldr d6, [x17, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "ldr x27, [x28, #0x8]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "ldr x25, [x26, #0x8]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "mov v6.d[1], x10\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "ldr x10, [x17, #0x68]\n" + "fmla v27.8h, v7.8h, v4.h[0]\n" + "ldr d7, [x17, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "add x24, x24, #0x10\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" 
+ "fmla v16.8h, v6.8h, v2.h[1]\n" + "mov v7.d[1], x11\n" + "fmla v20.8h, v6.8h, v3.h[1]\n" + "ldr x11, [x17, #0x78]\n" + "fmla v24.8h, v6.8h, v4.h[1]\n" + "ldr d6, [x17, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "ldr x23, [x24, #0x8]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "add x22, x22, #0x10\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "mov v6.d[1], x10\n" + "fmla v21.8h, v7.8h, v3.h[1]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v25.8h, v7.8h, v4.h[1]\n" + "ldr d7, [x17, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "ldr x10, [x17, #0x88]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "ldr x21, [x22, #0x8]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "mov v7.d[1], x11\n" + "fmla v22.8h, v6.8h, v3.h[1]\n" + "ldr x11, [x17, #0x98]\n" + "fmla v26.8h, v6.8h, v4.h[1]\n" + "ldr d6, [x17, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "sub x13, x13, #0x8\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "cmp x13, #0x10\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "mov v6.d[1], x10\n" + "fmla v23.8h, v7.8h, v3.h[1]\n" + "ldr x10, [x17, #0xa8]\n" + "fmla v27.8h, v7.8h, v4.h[1]\n" + "ldr d7, [x17, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "mov v7.d[1], x11\n" + "fmla v20.8h, v6.8h, v3.h[2]\n" + "ldr x11, [x17, #0xb8]\n" + "fmla v24.8h, v6.8h, v4.h[2]\n" + "ldr d6, [x17, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "mov v6.d[1], x10\n" + "fmla v21.8h, v7.8h, v3.h[2]\n" + "ldr x10, [x17, #0xc8]\n" + "fmla v25.8h, v7.8h, v4.h[2]\n" + "ldr d7, [x17, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "mov v7.d[1], x11\n" + "fmla v22.8h, v6.8h, v3.h[2]\n" + "ldr x11, [x17, #0xd8]\n" + "fmla v26.8h, v6.8h, v4.h[2]\n" + "ldr d6, [x17, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "mov v6.d[1], x10\n" + "fmla v23.8h, v7.8h, v3.h[2]\n" + "ldr x10, 
[x17, #0xe8]\n" + "fmla v27.8h, v7.8h, v4.h[2]\n" + "ldr d7, [x17, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "mov v7.d[1], x11\n" + "fmla v20.8h, v6.8h, v3.h[3]\n" + "ldr x11, [x17, #0xf8]\n" + "fmla v24.8h, v6.8h, v4.h[3]\n" + "ldr d6, [x17, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "mov v6.d[1], x10\n" + "fmla v21.8h, v7.8h, v3.h[3]\n" + "ldr x10, [x17, #0x108]\n" + "fmla v25.8h, v7.8h, v4.h[3]\n" + "ldr d7, [x17, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "mov v7.d[1], x11\n" + "fmla v22.8h, v6.8h, v3.h[3]\n" + "ldr x11, [x17, #0x118]\n" + "fmla v26.8h, v6.8h, v4.h[3]\n" + "ldr d6, [x17, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "mov v6.d[1], x10\n" + "fmla v23.8h, v7.8h, v3.h[3]\n" + "ldr x10, [x17, #0x128]\n" + "fmla v27.8h, v7.8h, v4.h[3]\n" + "ldr d7, [x17, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "mov v7.d[1], x11\n" + "fmla v20.8h, v6.8h, v3.h[4]\n" + "ldr x11, [x17, #0x138]\n" + "fmla v24.8h, v6.8h, v4.h[4]\n" + "ldr d6, [x17, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "mov v6.d[1], x10\n" + "fmla v21.8h, v7.8h, v3.h[4]\n" + "ldr x10, [x17, #0x148]\n" + "fmla v25.8h, v7.8h, v4.h[4]\n" + "ldr d7, [x17, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "mov v7.d[1], x11\n" + "fmla v22.8h, v6.8h, v3.h[4]\n" + "ldr x11, [x17, #0x158]\n" + "fmla v26.8h, v6.8h, v4.h[4]\n" + "ldr d6, [x17, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "mov v6.d[1], x10\n" + "fmla v23.8h, v7.8h, v3.h[4]\n" + "ldr x10, [x17, 
#0x168]\n" + "fmla v27.8h, v7.8h, v4.h[4]\n" + "ldr d7, [x17, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "mov v7.d[1], x11\n" + "fmla v20.8h, v6.8h, v3.h[5]\n" + "ldr x11, [x17, #0x178]\n" + "fmla v24.8h, v6.8h, v4.h[5]\n" + "ldr d6, [x17, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "mov v6.d[1], x10\n" + "fmla v21.8h, v7.8h, v3.h[5]\n" + "ldr x10, [x17, #0x188]\n" + "fmla v25.8h, v7.8h, v4.h[5]\n" + "ldr d7, [x17, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "mov v7.d[1], x11\n" + "fmla v22.8h, v6.8h, v3.h[5]\n" + "ldr x11, [x17, #0x198]\n" + "fmla v26.8h, v6.8h, v4.h[5]\n" + "ldr d6, [x17, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "mov v6.d[1], x10\n" + "fmla v23.8h, v7.8h, v3.h[5]\n" + "ldr x10, [x17, #0x1a8]\n" + "fmla v27.8h, v7.8h, v4.h[5]\n" + "ldr d7, [x17, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "mov v7.d[1], x11\n" + "fmla v20.8h, v6.8h, v3.h[6]\n" + "ldr x11, [x17, #0x1b8]\n" + "fmla v24.8h, v6.8h, v4.h[6]\n" + "ldr d6, [x17, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "mov v6.d[1], x10\n" + "fmla v21.8h, v7.8h, v3.h[6]\n" + "ldr x10, [x17, #0x1c8]\n" + "fmla v25.8h, v7.8h, v4.h[6]\n" + "ldr d7, [x17, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "mov v7.d[1], x11\n" + "fmla v22.8h, v6.8h, v3.h[6]\n" + "ldr x11, [x17, #0x1d8]\n" + "fmla v26.8h, v6.8h, v4.h[6]\n" + "ldr d6, [x17, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "mov v6.d[1], x10\n" + "fmla v23.8h, v7.8h, v3.h[6]\n" + "ldr x10, [x17, 
#0x1e8]\n" + "fmla v27.8h, v7.8h, v4.h[6]\n" + "ldr d7, [x17, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "mov v7.d[1], x11\n" + "fmla v20.8h, v6.8h, v3.h[7]\n" + "ldr x11, [x17, #0x1f8]\n" + "fmla v24.8h, v6.8h, v4.h[7]\n" + "ldr d6, [x17, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "mov v6.d[1], x10\n" + "fmla v21.8h, v7.8h, v3.h[7]\n" + "fmla v25.8h, v7.8h, v4.h[7]\n" + "ldr d7, [x17, #0x1f0]\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "add x17, x17, #0x200\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "ldr x10, [x17, #0x8]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "mov v7.d[1], x11\n" + "fmla v22.8h, v6.8h, v3.h[7]\n" + "fmla v26.8h, v6.8h, v4.h[7]\n" + "ldr d6, [x17, #0x0]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "ldr d0, [x12, #0x0]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "ldr d1, [x28, #0x0]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "mov v6.d[1], x10\n" + "fmla v23.8h, v7.8h, v3.h[7]\n" + "mov v0.d[1], x9\n" + "fmla v27.8h, v7.8h, v4.h[7]\n" + "mov v1.d[1], x27\n" + "ldr d2, [x26, #0x0]\n" + "ldr d3, [x24, #0x0]\n" + "ldr d4, [x22, #0x0]\n" + "mov v2.d[1], x25\n" + "mov v3.d[1], x23\n" + "mov v4.d[1], x21\n" + "bge 222b\n" + "223:" // Height 5: Multiply loop: Single iteration only + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x17, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "sub x13, x13, #0x8\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "add x12, x12, #0x10\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v24.8h, v6.8h, v4.h[0]\n" + "ldr q6, [x17, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "add x28, x28, #0x10\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "add x26, x26, #0x10\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v25.8h, v7.8h, v4.h[0]\n" + "ldr q7, [x17, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "add x24, 
x24, #0x10\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "add x22, x22, #0x10\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v26.8h, v6.8h, v4.h[0]\n" + "ldr q6, [x17, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "fmla v27.8h, v7.8h, v4.h[0]\n" + "ldr q7, [x17, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "fmla v20.8h, v6.8h, v3.h[1]\n" + "fmla v24.8h, v6.8h, v4.h[1]\n" + "ldr q6, [x17, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "fmla v21.8h, v7.8h, v3.h[1]\n" + "fmla v25.8h, v7.8h, v4.h[1]\n" + "ldr q7, [x17, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "fmla v22.8h, v6.8h, v3.h[1]\n" + "fmla v26.8h, v6.8h, v4.h[1]\n" + "ldr q6, [x17, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "fmla v23.8h, v7.8h, v3.h[1]\n" + "fmla v27.8h, v7.8h, v4.h[1]\n" + "ldr q7, [x17, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "fmla v20.8h, v6.8h, v3.h[2]\n" + "fmla v24.8h, v6.8h, v4.h[2]\n" + "ldr q6, [x17, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "fmla v21.8h, v7.8h, v3.h[2]\n" + "fmla v25.8h, v7.8h, v4.h[2]\n" + "ldr q7, [x17, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "fmla v22.8h, v6.8h, v3.h[2]\n" + "fmla v26.8h, v6.8h, v4.h[2]\n" + "ldr q6, [x17, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "fmla v23.8h, v7.8h, v3.h[2]\n" + "fmla v27.8h, 
v7.8h, v4.h[2]\n" + "ldr q7, [x17, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "fmla v20.8h, v6.8h, v3.h[3]\n" + "fmla v24.8h, v6.8h, v4.h[3]\n" + "ldr q6, [x17, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "fmla v21.8h, v7.8h, v3.h[3]\n" + "fmla v25.8h, v7.8h, v4.h[3]\n" + "ldr q7, [x17, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "fmla v22.8h, v6.8h, v3.h[3]\n" + "fmla v26.8h, v6.8h, v4.h[3]\n" + "ldr q6, [x17, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "fmla v23.8h, v7.8h, v3.h[3]\n" + "fmla v27.8h, v7.8h, v4.h[3]\n" + "ldr q7, [x17, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "fmla v20.8h, v6.8h, v3.h[4]\n" + "fmla v24.8h, v6.8h, v4.h[4]\n" + "ldr q6, [x17, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "fmla v21.8h, v7.8h, v3.h[4]\n" + "fmla v25.8h, v7.8h, v4.h[4]\n" + "ldr q7, [x17, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "fmla v22.8h, v6.8h, v3.h[4]\n" + "fmla v26.8h, v6.8h, v4.h[4]\n" + "ldr q6, [x17, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "fmla v23.8h, v7.8h, v3.h[4]\n" + "fmla v27.8h, v7.8h, v4.h[4]\n" + "ldr q7, [x17, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "fmla v20.8h, v6.8h, v3.h[5]\n" + "fmla v24.8h, v6.8h, v4.h[5]\n" + "ldr q6, [x17, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "fmla v21.8h, v7.8h, v3.h[5]\n" + "fmla v25.8h, v7.8h, v4.h[5]\n" + "ldr q7, 
[x17, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "fmla v22.8h, v6.8h, v3.h[5]\n" + "fmla v26.8h, v6.8h, v4.h[5]\n" + "ldr q6, [x17, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "fmla v23.8h, v7.8h, v3.h[5]\n" + "fmla v27.8h, v7.8h, v4.h[5]\n" + "ldr q7, [x17, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "fmla v20.8h, v6.8h, v3.h[6]\n" + "fmla v24.8h, v6.8h, v4.h[6]\n" + "ldr q6, [x17, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "fmla v21.8h, v7.8h, v3.h[6]\n" + "fmla v25.8h, v7.8h, v4.h[6]\n" + "ldr q7, [x17, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "fmla v22.8h, v6.8h, v3.h[6]\n" + "fmla v26.8h, v6.8h, v4.h[6]\n" + "ldr q6, [x17, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "fmla v23.8h, v7.8h, v3.h[6]\n" + "fmla v27.8h, v7.8h, v4.h[6]\n" + "ldr q7, [x17, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "fmla v20.8h, v6.8h, v3.h[7]\n" + "fmla v24.8h, v6.8h, v4.h[7]\n" + "ldr q6, [x17, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "fmla v21.8h, v7.8h, v3.h[7]\n" + "fmla v25.8h, v7.8h, v4.h[7]\n" + "ldr q7, [x17, #0x1f0]\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "add x17, x17, #0x200\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v22.8h, v6.8h, v3.h[7]\n" + "fmla v26.8h, v6.8h, v4.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "fmla v23.8h, v7.8h, v3.h[7]\n" + "fmla v27.8h, v7.8h, v4.h[7]\n" + "224:" // Height 5: Multiply 
loop: Main loop skip + "cbz x13, 226f\n" + "225:" // Height 5: Multiply loop: Odd block loop + "ldr h0, [x12], #0x2\n" + "sub x13, x13, #0x1\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr h3, [x24], #0x2\n" + "ldr h4, [x22], #0x2\n" + "ldr q6, [x17, #0x0]\n" + "ldr q7, [x17, #0x10]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "fmla v24.8h, v6.8h, v4.h[0]\n" + "ldr q6, [x17, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "fmla v25.8h, v7.8h, v4.h[0]\n" + "ldr q7, [x17, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "add x17, x17, #0x40\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "fmla v26.8h, v6.8h, v4.h[0]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "fmla v27.8h, v7.8h, v4.h[0]\n" + "cbnz x13, 225b\n" + "226:" // Height 5: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x14, x14, #0x1\n" + "cmp x14, x19\n" + "bne 219b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x15, #0x0]\n" + "add x25, x15, x19, LSL #1\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add x24, x25, x19, LSL #1\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #1\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #1\n" + "prfm pstl1keep, [x22, #0x0]\n" + "tbz %x[flags], #1, 227f\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v1.8h }, [x20]\n" + "ld1r { v0.8h }, [x19]\n" + "fmin v8.8h, v8.8h, v0.8h\n" + "fmin v9.8h, v9.8h, v0.8h\n" + "fmin v10.8h, v10.8h, v0.8h\n" + "fmin v11.8h, v11.8h, v0.8h\n" + "fmin v12.8h, v12.8h, v0.8h\n" + "fmin v13.8h, v13.8h, v0.8h\n" + "fmin v14.8h, v14.8h, 
v0.8h\n" + "fmin v15.8h, v15.8h, v0.8h\n" + "fmin v16.8h, v16.8h, v0.8h\n" + "fmin v17.8h, v17.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + "fmax v14.8h, v14.8h, v1.8h\n" + "fmax v15.8h, v15.8h, v1.8h\n" + "fmax v16.8h, v16.8h, v1.8h\n" + "fmax v17.8h, v17.8h, v1.8h\n" + "fmin v18.8h, v18.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v0.8h\n" + "fmin v20.8h, v20.8h, v0.8h\n" + "fmin v21.8h, v21.8h, v0.8h\n" + "fmin v22.8h, v22.8h, v0.8h\n" + "fmin v23.8h, v23.8h, v0.8h\n" + "fmin v24.8h, v24.8h, v0.8h\n" + "fmin v25.8h, v25.8h, v0.8h\n" + "fmin v26.8h, v26.8h, v0.8h\n" + "fmin v27.8h, v27.8h, v0.8h\n" + "fmax v18.8h, v18.8h, v1.8h\n" + "fmax v19.8h, v19.8h, v1.8h\n" + "fmax v20.8h, v20.8h, v1.8h\n" + "fmax v21.8h, v21.8h, v1.8h\n" + "fmax v22.8h, v22.8h, v1.8h\n" + "fmax v23.8h, v23.8h, v1.8h\n" + "fmax v24.8h, v24.8h, v1.8h\n" + "fmax v25.8h, v25.8h, v1.8h\n" + "fmax v26.8h, v26.8h, v1.8h\n" + "fmax v27.8h, v27.8h, v1.8h\n" + "227:" // Height 5: No activation + "cmp x8, #0x20\n" + "bge 244f\n" + "tbz x8, #4, 235f\n" + "st1 { v8.8h }, [x15], #0x10\n" + "st1 { v9.8h }, [x15], #0x10\n" + "st1 { v12.8h }, [x25], #0x10\n" + "st1 { v13.8h }, [x25], #0x10\n" + "st1 { v16.8h }, [x24], #0x10\n" + "st1 { v17.8h }, [x24], #0x10\n" + "st1 { v20.8h }, [x23], #0x10\n" + "st1 { v21.8h }, [x23], #0x10\n" + "st1 { v24.8h }, [x22], #0x10\n" + "st1 { v25.8h }, [x22], #0x10\n" + "tbz x8, #3, 231f\n" + "st1 { v10.8h }, [x15], #0x10\n" + "st1 { v14.8h }, [x25], #0x10\n" + "st1 { v18.8h }, [x24], #0x10\n" + "st1 { v22.8h }, [x23], #0x10\n" + "st1 { v26.8h }, [x22], #0x10\n" + "tbz x8, #2, 229f\n" + "str d11, [x15], #0x8\n" + "str d15, [x25], #0x8\n" + "str d19, [x24], #0x8\n" + "str d23, [x23], #0x8\n" + "str d27, [x22], #0x8\n" + "tbz x8, #1, 228f\n" + "st1 { v11.s }[2], [x15], #0x4\n" + "st1 { v15.s }[2], [x25], #0x4\n" + "st1 { 
v19.s }[2], [x24], #0x4\n" + "st1 { v23.s }[2], [x23], #0x4\n" + "st1 { v27.s }[2], [x22], #0x4\n" + "tbz x8, #0, 243f\n" + "st1 { v11.h }[6], [x15]\n" + "st1 { v15.h }[6], [x25]\n" + "st1 { v19.h }[6], [x24]\n" + "st1 { v23.h }[6], [x23]\n" + "st1 { v27.h }[6], [x22]\n" + "b 243f\n" + "228:" // Height 5: Partial direct writeback: partial_1_28 + "tbz x8, #0, 243f\n" + "st1 { v11.h }[4], [x15]\n" + "st1 { v15.h }[4], [x25]\n" + "st1 { v19.h }[4], [x24]\n" + "st1 { v23.h }[4], [x23]\n" + "st1 { v27.h }[4], [x22]\n" + "b 243f\n" + "229:" // Height 5: Partial direct writeback: partial_2_24 + "tbz x8, #1, 230f\n" + "str s11, [x15], #0x4\n" + "str s15, [x25], #0x4\n" + "str s19, [x24], #0x4\n" + "str s23, [x23], #0x4\n" + "str s27, [x22], #0x4\n" + "tbz x8, #0, 243f\n" + "st1 { v11.h }[2], [x15]\n" + "st1 { v15.h }[2], [x25]\n" + "st1 { v19.h }[2], [x24]\n" + "st1 { v23.h }[2], [x23]\n" + "st1 { v27.h }[2], [x22]\n" + "b 243f\n" + "230:" // Height 5: Partial direct writeback: partial_1_24 + "tbz x8, #0, 243f\n" + "str h11, [x15, #0x0]\n" + "str h15, [x25, #0x0]\n" + "str h19, [x24, #0x0]\n" + "str h23, [x23, #0x0]\n" + "str h27, [x22, #0x0]\n" + "b 243f\n" + "231:" // Height 5: Partial direct writeback: partial_4_16 + "tbz x8, #2, 233f\n" + "str d10, [x15], #0x8\n" + "str d14, [x25], #0x8\n" + "str d18, [x24], #0x8\n" + "str d22, [x23], #0x8\n" + "str d26, [x22], #0x8\n" + "tbz x8, #1, 232f\n" + "st1 { v10.s }[2], [x15], #0x4\n" + "st1 { v14.s }[2], [x25], #0x4\n" + "st1 { v18.s }[2], [x24], #0x4\n" + "st1 { v22.s }[2], [x23], #0x4\n" + "st1 { v26.s }[2], [x22], #0x4\n" + "tbz x8, #0, 243f\n" + "st1 { v10.h }[6], [x15]\n" + "st1 { v14.h }[6], [x25]\n" + "st1 { v18.h }[6], [x24]\n" + "st1 { v22.h }[6], [x23]\n" + "st1 { v26.h }[6], [x22]\n" + "b 243f\n" + "232:" // Height 5: Partial direct writeback: partial_1_20 + "tbz x8, #0, 243f\n" + "st1 { v10.h }[4], [x15]\n" + "st1 { v14.h }[4], [x25]\n" + "st1 { v18.h }[4], [x24]\n" + "st1 { v22.h }[4], [x23]\n" + "st1 { v26.h 
}[4], [x22]\n" + "b 243f\n" + "233:" // Height 5: Partial direct writeback: partial_2_16 + "tbz x8, #1, 234f\n" + "str s10, [x15], #0x4\n" + "str s14, [x25], #0x4\n" + "str s18, [x24], #0x4\n" + "str s22, [x23], #0x4\n" + "str s26, [x22], #0x4\n" + "tbz x8, #0, 243f\n" + "st1 { v10.h }[2], [x15]\n" + "st1 { v14.h }[2], [x25]\n" + "st1 { v18.h }[2], [x24]\n" + "st1 { v22.h }[2], [x23]\n" + "st1 { v26.h }[2], [x22]\n" + "b 243f\n" + "234:" // Height 5: Partial direct writeback: partial_1_16 + "tbz x8, #0, 243f\n" + "str h10, [x15, #0x0]\n" + "str h14, [x25, #0x0]\n" + "str h18, [x24, #0x0]\n" + "str h22, [x23, #0x0]\n" + "str h26, [x22, #0x0]\n" + "b 243f\n" + "235:" // Height 5: Partial direct writeback: partial_8_0 + "tbz x8, #3, 239f\n" + "st1 { v8.8h }, [x15], #0x10\n" + "st1 { v12.8h }, [x25], #0x10\n" + "st1 { v16.8h }, [x24], #0x10\n" + "st1 { v20.8h }, [x23], #0x10\n" + "st1 { v24.8h }, [x22], #0x10\n" + "tbz x8, #2, 237f\n" + "str d9, [x15], #0x8\n" + "str d13, [x25], #0x8\n" + "str d17, [x24], #0x8\n" + "str d21, [x23], #0x8\n" + "str d25, [x22], #0x8\n" + "tbz x8, #1, 236f\n" + "st1 { v9.s }[2], [x15], #0x4\n" + "st1 { v13.s }[2], [x25], #0x4\n" + "st1 { v17.s }[2], [x24], #0x4\n" + "st1 { v21.s }[2], [x23], #0x4\n" + "st1 { v25.s }[2], [x22], #0x4\n" + "tbz x8, #0, 243f\n" + "st1 { v9.h }[6], [x15]\n" + "st1 { v13.h }[6], [x25]\n" + "st1 { v17.h }[6], [x24]\n" + "st1 { v21.h }[6], [x23]\n" + "st1 { v25.h }[6], [x22]\n" + "b 243f\n" + "236:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x8, #0, 243f\n" + "st1 { v9.h }[4], [x15]\n" + "st1 { v13.h }[4], [x25]\n" + "st1 { v17.h }[4], [x24]\n" + "st1 { v21.h }[4], [x23]\n" + "st1 { v25.h }[4], [x22]\n" + "b 243f\n" + "237:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x8, #1, 238f\n" + "str s9, [x15], #0x4\n" + "str s13, [x25], #0x4\n" + "str s17, [x24], #0x4\n" + "str s21, [x23], #0x4\n" + "str s25, [x22], #0x4\n" + "tbz x8, #0, 243f\n" + "st1 { v9.h }[2], [x15]\n" + "st1 { v13.h 
}[2], [x25]\n" + "st1 { v17.h }[2], [x24]\n" + "st1 { v21.h }[2], [x23]\n" + "st1 { v25.h }[2], [x22]\n" + "b 243f\n" + "238:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x8, #0, 243f\n" + "str h9, [x15, #0x0]\n" + "str h13, [x25, #0x0]\n" + "str h17, [x24, #0x0]\n" + "str h21, [x23, #0x0]\n" + "str h25, [x22, #0x0]\n" + "b 243f\n" + "239:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x8, #2, 241f\n" + "str d8, [x15], #0x8\n" + "str d12, [x25], #0x8\n" + "str d16, [x24], #0x8\n" + "str d20, [x23], #0x8\n" + "str d24, [x22], #0x8\n" + "tbz x8, #1, 240f\n" + "st1 { v8.s }[2], [x15], #0x4\n" + "st1 { v12.s }[2], [x25], #0x4\n" + "st1 { v16.s }[2], [x24], #0x4\n" + "st1 { v20.s }[2], [x23], #0x4\n" + "st1 { v24.s }[2], [x22], #0x4\n" + "tbz x8, #0, 243f\n" + "st1 { v8.h }[6], [x15]\n" + "st1 { v12.h }[6], [x25]\n" + "st1 { v16.h }[6], [x24]\n" + "st1 { v20.h }[6], [x23]\n" + "st1 { v24.h }[6], [x22]\n" + "b 243f\n" + "240:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x8, #0, 243f\n" + "st1 { v8.h }[4], [x15]\n" + "st1 { v12.h }[4], [x25]\n" + "st1 { v16.h }[4], [x24]\n" + "st1 { v20.h }[4], [x23]\n" + "st1 { v24.h }[4], [x22]\n" + "b 243f\n" + "241:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x8, #1, 242f\n" + "str s8, [x15], #0x4\n" + "str s12, [x25], #0x4\n" + "str s16, [x24], #0x4\n" + "str s20, [x23], #0x4\n" + "str s24, [x22], #0x4\n" + "tbz x8, #0, 243f\n" + "st1 { v8.h }[2], [x15]\n" + "st1 { v12.h }[2], [x25]\n" + "st1 { v16.h }[2], [x24]\n" + "st1 { v20.h }[2], [x23]\n" + "st1 { v24.h }[2], [x22]\n" + "b 243f\n" + "242:" // Height 5: Partial direct writeback: partial_1_0 + "str h8, [x15, #0x0]\n" + "str h12, [x25, #0x0]\n" + "str h16, [x24, #0x0]\n" + "str h20, [x23, #0x0]\n" + "str h24, [x22, #0x0]\n" + "243:" // Height 5: Partial direct writeback: Done + "b 245f\n" + "244:" // Height 5: Full writeback + "str q8, [x15, #0x0]\n" + "str q9, [x15, #0x10]\n" + "str q10, [x15, #0x20]\n" + "str q11, [x15, 
#0x30]\n" + "add x15, x15, #0x40\n" + "str q12, [x25, #0x0]\n" + "str q13, [x25, #0x10]\n" + "str q14, [x25, #0x20]\n" + "str q15, [x25, #0x30]\n" + "str q16, [x24, #0x0]\n" + "str q17, [x24, #0x10]\n" + "str q18, [x24, #0x20]\n" + "str q19, [x24, #0x30]\n" + "str q20, [x23, #0x0]\n" + "str q21, [x23, #0x10]\n" + "str q22, [x23, #0x20]\n" + "str q23, [x23, #0x30]\n" + "str q24, [x22, #0x0]\n" + "str q25, [x22, #0x10]\n" + "str q26, [x22, #0x20]\n" + "str q27, [x22, #0x30]\n" + "245:" // Height 5: Writeback done + "subs x8, x8, #0x20\n" + "bgt 198b\n" + "b 296f\n" + "246:" // Height 6 + "ldr x8, [%x[args_ptr], %[offsetof_N]]\n" + "mov x16, %x[bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x15, %x[output_ptr]\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x19, #0xc\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "247:" // Height 6: Column loop + "cbz x16, 248f\n" + "ldr q8, [x16, #0x0]\n" + "ldr q9, [x16, #0x10]\n" + "ldr q10, [x16, #0x20]\n" + "mov v12.16b, v8.16b\n" + "mov v16.16b, v8.16b\n" + "mov v13.16b, v9.16b\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v18.16b, v10.16b\n" + "mov v20.16b, v8.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "mov v24.16b, v8.16b\n" + "mov v25.16b, v9.16b\n" + "mov v26.16b, v10.16b\n" + "mov v28.16b, v8.16b\n" + "mov v29.16b, v9.16b\n" + "mov v30.16b, v10.16b\n" + "ldr q11, [x16, #0x30]\n" + "add x16, x16, #0x40\n" + "mov v15.16b, v11.16b\n" + "mov v19.16b, v11.16b\n" + "mov v23.16b, v11.16b\n" + "mov v27.16b, v11.16b\n" + "mov v31.16b, v11.16b\n" + "b 267f\n" + "248:" // Height 6: no bias + "tbz %x[flags], #0, 266f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x8, #0x20\n" + "add x25, x15, x19, LSL #1\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "add x21, x22, x19, LSL #1\n" + "bge 265f\n" + "tbz x8, #4, 256f\n" + "ld1 { v8.8h }, [x15], #0x10\n" + "ld1 { v12.8h }, [x25], 
#0x10\n" + "ld1 { v16.8h }, [x24], #0x10\n" + "ld1 { v9.8h }, [x15], #0x10\n" + "ld1 { v13.8h }, [x25], #0x10\n" + "ld1 { v17.8h }, [x24], #0x10\n" + "ld1 { v20.8h }, [x23], #0x10\n" + "ld1 { v24.8h }, [x22], #0x10\n" + "ld1 { v28.8h }, [x21], #0x10\n" + "ld1 { v21.8h }, [x23], #0x10\n" + "ld1 { v25.8h }, [x22], #0x10\n" + "ld1 { v29.8h }, [x21], #0x10\n" + "tbz x8, #3, 252f\n" + "ld1 { v10.8h }, [x15], #0x10\n" + "ld1 { v14.8h }, [x25], #0x10\n" + "ld1 { v18.8h }, [x24], #0x10\n" + "ld1 { v22.8h }, [x23], #0x10\n" + "ld1 { v26.8h }, [x22], #0x10\n" + "ld1 { v30.8h }, [x21], #0x10\n" + "tbz x8, #2, 250f\n" + "ldr d11, [x15], #0x8\n" + "ldr d15, [x25], #0x8\n" + "ldr d19, [x24], #0x8\n" + "ldr d23, [x23], #0x8\n" + "ldr d27, [x22], #0x8\n" + "ldr d31, [x21], #0x8\n" + "tbz x8, #1, 249f\n" + "ld1 { v11.s }[2], [x15], #0x4\n" + "mov x19, #0x3c\n" + "ld1 { v15.s }[2], [x25], #0x4\n" + "ld1 { v19.s }[2], [x24], #0x4\n" + "ld1 { v23.s }[2], [x23], #0x4\n" + "ld1 { v27.s }[2], [x22], #0x4\n" + "ld1 { v31.s }[2], [x21], #0x4\n" + "tbz x8, #0, 264f\n" + "ld1 { v11.h }[6], [x15]\n" + "ld1 { v15.h }[6], [x25]\n" + "ld1 { v19.h }[6], [x24]\n" + "ld1 { v23.h }[6], [x23]\n" + "ld1 { v27.h }[6], [x22]\n" + "ld1 { v31.h }[6], [x21]\n" + "b 264f\n" + "249:" // Height 6: Partial accumulate: partial_1_28 + "mov x19, #0x38\n" + "tbz x8, #0, 264f\n" + "ld1 { v11.h }[4], [x15]\n" + "ld1 { v15.h }[4], [x25]\n" + "ld1 { v19.h }[4], [x24]\n" + "ld1 { v23.h }[4], [x23]\n" + "ld1 { v27.h }[4], [x22]\n" + "ld1 { v31.h }[4], [x21]\n" + "b 264f\n" + "250:" // Height 6: Partial accumulate: partial_2_24 + "tbz x8, #1, 251f\n" + "ldr s11, [x15], #0x4\n" + "ldr s15, [x25], #0x4\n" + "mov x19, #0x34\n" + "ldr s19, [x24], #0x4\n" + "ldr s23, [x23], #0x4\n" + "ldr s27, [x22], #0x4\n" + "ldr s31, [x21], #0x4\n" + "tbz x8, #0, 264f\n" + "ld1 { v11.h }[2], [x15]\n" + "ld1 { v15.h }[2], [x25]\n" + "ld1 { v19.h }[2], [x24]\n" + "ld1 { v23.h }[2], [x23]\n" + "ld1 { v27.h }[2], [x22]\n" + "ld1 { v31.h }[2], 
[x21]\n" + "b 264f\n" + "251:" // Height 6: Partial accumulate: partial_1_24 + "mov x19, #0x30\n" + "tbz x8, #0, 264f\n" + "ldr h11, [x15, #0x0]\n" + "ldr h15, [x25, #0x0]\n" + "ldr h19, [x24, #0x0]\n" + "ldr h23, [x23, #0x0]\n" + "ldr h27, [x22, #0x0]\n" + "ldr h31, [x21, #0x0]\n" + "b 264f\n" + "252:" // Height 6: Partial accumulate: partial_4_16 + "tbz x8, #2, 254f\n" + "ldr d10, [x15], #0x8\n" + "ldr d14, [x25], #0x8\n" + "ldr d18, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "ldr d26, [x22], #0x8\n" + "ldr d30, [x21], #0x8\n" + "tbz x8, #1, 253f\n" + "ld1 { v10.s }[2], [x15], #0x4\n" + "mov x19, #0x2c\n" + "ld1 { v14.s }[2], [x25], #0x4\n" + "ld1 { v18.s }[2], [x24], #0x4\n" + "ld1 { v22.s }[2], [x23], #0x4\n" + "ld1 { v26.s }[2], [x22], #0x4\n" + "ld1 { v30.s }[2], [x21], #0x4\n" + "tbz x8, #0, 264f\n" + "ld1 { v10.h }[6], [x15]\n" + "ld1 { v14.h }[6], [x25]\n" + "ld1 { v18.h }[6], [x24]\n" + "ld1 { v22.h }[6], [x23]\n" + "ld1 { v26.h }[6], [x22]\n" + "ld1 { v30.h }[6], [x21]\n" + "b 264f\n" + "253:" // Height 6: Partial accumulate: partial_1_20 + "mov x19, #0x28\n" + "tbz x8, #0, 264f\n" + "ld1 { v10.h }[4], [x15]\n" + "ld1 { v14.h }[4], [x25]\n" + "ld1 { v18.h }[4], [x24]\n" + "ld1 { v22.h }[4], [x23]\n" + "ld1 { v26.h }[4], [x22]\n" + "ld1 { v30.h }[4], [x21]\n" + "b 264f\n" + "254:" // Height 6: Partial accumulate: partial_2_16 + "tbz x8, #1, 255f\n" + "ldr s10, [x15], #0x4\n" + "ldr s14, [x25], #0x4\n" + "mov x19, #0x24\n" + "ldr s18, [x24], #0x4\n" + "ldr s22, [x23], #0x4\n" + "ldr s26, [x22], #0x4\n" + "ldr s30, [x21], #0x4\n" + "tbz x8, #0, 264f\n" + "ld1 { v10.h }[2], [x15]\n" + "ld1 { v14.h }[2], [x25]\n" + "ld1 { v18.h }[2], [x24]\n" + "ld1 { v22.h }[2], [x23]\n" + "ld1 { v26.h }[2], [x22]\n" + "ld1 { v30.h }[2], [x21]\n" + "b 264f\n" + "255:" // Height 6: Partial accumulate: partial_1_16 + "mov x19, #0x20\n" + "tbz x8, #0, 264f\n" + "ldr h10, [x15, #0x0]\n" + "ldr h14, [x25, #0x0]\n" + "ldr h18, [x24, #0x0]\n" + "ldr h22, [x23, #0x0]\n" + "ldr h26, 
[x22, #0x0]\n" + "ldr h30, [x21, #0x0]\n" + "b 264f\n" + "256:" // Height 6: Partial accumulate: partial_8_0 + "tbz x8, #3, 260f\n" + "ld1 { v8.8h }, [x15], #0x10\n" + "ld1 { v12.8h }, [x25], #0x10\n" + "ld1 { v16.8h }, [x24], #0x10\n" + "ld1 { v20.8h }, [x23], #0x10\n" + "ld1 { v24.8h }, [x22], #0x10\n" + "ld1 { v28.8h }, [x21], #0x10\n" + "tbz x8, #2, 258f\n" + "ldr d9, [x15], #0x8\n" + "ldr d13, [x25], #0x8\n" + "ldr d17, [x24], #0x8\n" + "ldr d21, [x23], #0x8\n" + "ldr d25, [x22], #0x8\n" + "ldr d29, [x21], #0x8\n" + "tbz x8, #1, 257f\n" + "ld1 { v9.s }[2], [x15], #0x4\n" + "mov x19, #0x1c\n" + "ld1 { v13.s }[2], [x25], #0x4\n" + "ld1 { v17.s }[2], [x24], #0x4\n" + "ld1 { v21.s }[2], [x23], #0x4\n" + "ld1 { v25.s }[2], [x22], #0x4\n" + "ld1 { v29.s }[2], [x21], #0x4\n" + "tbz x8, #0, 264f\n" + "ld1 { v9.h }[6], [x15]\n" + "ld1 { v13.h }[6], [x25]\n" + "ld1 { v17.h }[6], [x24]\n" + "ld1 { v21.h }[6], [x23]\n" + "ld1 { v25.h }[6], [x22]\n" + "ld1 { v29.h }[6], [x21]\n" + "b 264f\n" + "257:" // Height 6: Partial accumulate: partial_1_12 + "mov x19, #0x18\n" + "tbz x8, #0, 264f\n" + "ld1 { v9.h }[4], [x15]\n" + "ld1 { v13.h }[4], [x25]\n" + "ld1 { v17.h }[4], [x24]\n" + "ld1 { v21.h }[4], [x23]\n" + "ld1 { v25.h }[4], [x22]\n" + "ld1 { v29.h }[4], [x21]\n" + "b 264f\n" + "258:" // Height 6: Partial accumulate: partial_2_8 + "tbz x8, #1, 259f\n" + "ldr s9, [x15], #0x4\n" + "ldr s13, [x25], #0x4\n" + "mov x19, #0x14\n" + "ldr s17, [x24], #0x4\n" + "ldr s21, [x23], #0x4\n" + "ldr s25, [x22], #0x4\n" + "ldr s29, [x21], #0x4\n" + "tbz x8, #0, 264f\n" + "ld1 { v9.h }[2], [x15]\n" + "ld1 { v13.h }[2], [x25]\n" + "ld1 { v17.h }[2], [x24]\n" + "ld1 { v21.h }[2], [x23]\n" + "ld1 { v25.h }[2], [x22]\n" + "ld1 { v29.h }[2], [x21]\n" + "b 264f\n" + "259:" // Height 6: Partial accumulate: partial_1_8 + "mov x19, #0x10\n" + "tbz x8, #0, 264f\n" + "ldr h9, [x15, #0x0]\n" + "ldr h13, [x25, #0x0]\n" + "ldr h17, [x24, #0x0]\n" + "ldr h21, [x23, #0x0]\n" + "ldr h25, [x22, #0x0]\n" + 
"ldr h29, [x21, #0x0]\n" + "b 264f\n" + "260:" // Height 6: Partial accumulate: partial_4_0 + "tbz x8, #2, 262f\n" + "ldr d8, [x15], #0x8\n" + "ldr d12, [x25], #0x8\n" + "ldr d16, [x24], #0x8\n" + "ldr d20, [x23], #0x8\n" + "ldr d24, [x22], #0x8\n" + "ldr d28, [x21], #0x8\n" + "tbz x8, #1, 261f\n" + "ld1 { v8.s }[2], [x15], #0x4\n" + "mov x19, #0xc\n" + "ld1 { v12.s }[2], [x25], #0x4\n" + "ld1 { v16.s }[2], [x24], #0x4\n" + "ld1 { v20.s }[2], [x23], #0x4\n" + "ld1 { v24.s }[2], [x22], #0x4\n" + "ld1 { v28.s }[2], [x21], #0x4\n" + "tbz x8, #0, 264f\n" + "ld1 { v8.h }[6], [x15]\n" + "ld1 { v12.h }[6], [x25]\n" + "ld1 { v16.h }[6], [x24]\n" + "ld1 { v20.h }[6], [x23]\n" + "ld1 { v24.h }[6], [x22]\n" + "ld1 { v28.h }[6], [x21]\n" + "b 264f\n" + "261:" // Height 6: Partial accumulate: partial_1_4 + "mov x19, #0x8\n" + "tbz x8, #0, 264f\n" + "ld1 { v8.h }[4], [x15]\n" + "ld1 { v12.h }[4], [x25]\n" + "ld1 { v16.h }[4], [x24]\n" + "ld1 { v20.h }[4], [x23]\n" + "ld1 { v24.h }[4], [x22]\n" + "ld1 { v28.h }[4], [x21]\n" + "b 264f\n" + "262:" // Height 6: Partial accumulate: partial_2_0 + "tbz x8, #1, 263f\n" + "ldr s8, [x15], #0x4\n" + "ldr s12, [x25], #0x4\n" + "mov x19, #0x4\n" + "ldr s16, [x24], #0x4\n" + "ldr s20, [x23], #0x4\n" + "ldr s24, [x22], #0x4\n" + "ldr s28, [x21], #0x4\n" + "tbz x8, #0, 264f\n" + "ld1 { v8.h }[2], [x15]\n" + "ld1 { v12.h }[2], [x25]\n" + "ld1 { v16.h }[2], [x24]\n" + "ld1 { v20.h }[2], [x23]\n" + "ld1 { v24.h }[2], [x22]\n" + "ld1 { v28.h }[2], [x21]\n" + "b 264f\n" + "263:" // Height 6: Partial accumulate: partial_1_0 + "ldr h8, [x15, #0x0]\n" + "mov x19, #0x0\n" + "ldr h12, [x25, #0x0]\n" + "ldr h16, [x24, #0x0]\n" + "ldr h20, [x23, #0x0]\n" + "ldr h24, [x22, #0x0]\n" + "ldr h28, [x21, #0x0]\n" + "264:" // Height 6: Partial accumulate: Done + "sub x15, x15, x19\n" + "b 267f\n" + "265:" // Height 6: full accumulate + "ldr q8, [x15, #0x0]\n" + "ldr q9, [x15, #0x10]\n" + "ldr q10, [x15, #0x20]\n" + "ldr q11, [x15, #0x30]\n" + "ldr q12, [x25, 
#0x0]\n" + "ldr q13, [x25, #0x10]\n" + "ldr q14, [x25, #0x20]\n" + "ldr q15, [x25, #0x30]\n" + "ldr q16, [x24, #0x0]\n" + "ldr q17, [x24, #0x10]\n" + "ldr q18, [x24, #0x20]\n" + "ldr q19, [x24, #0x30]\n" + "ldr q20, [x23, #0x0]\n" + "ldr q21, [x23, #0x10]\n" + "ldr q22, [x23, #0x20]\n" + "ldr q23, [x23, #0x30]\n" + "ldr q24, [x22, #0x0]\n" + "ldr q25, [x22, #0x10]\n" + "ldr q26, [x22, #0x20]\n" + "ldr q27, [x22, #0x30]\n" + "ldr q28, [x21, #0x0]\n" + "ldr q29, [x21, #0x10]\n" + "ldr q30, [x21, #0x20]\n" + "ldr q31, [x21, #0x30]\n" + "b 267f\n" + "266:" // Height 6: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "267:" // Height 6: setup done + "mov x14, #0x0\n" + "268:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w13, [x20, x14, LSL #0x2]\n" + "tbz %x[flags], #3, 269f\n" + "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x12, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x14, 270f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x12, x12, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "add x20, x20, x19, LSL #1\n" + "b 270f\n" + "269:" // 
Height 6: setup direct input + "mov x12, %x[input_ptr]\n" + "add x28, x12, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "add x24, x26, x19, LSL #1\n" + "add x22, x24, x19, LSL #1\n" + "add x20, x22, x19, LSL #1\n" + "270:" // Height 6: input setup done + "cmp x13, #0x8\n" + "blt 273f\n" + "ldr q0, [x12, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "cmp x13, #0x10\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x17, #0x0]\n" + "blt 272f\n" + "271:" // Height 6: Multiply loop: Main loop head + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr d7, [x17, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "ldr x11, [x17, #0x18]\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "ldr x10, [x17, #0x28]\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "add x12, x12, #0x10\n" + "fmla v24.8h, v6.8h, v4.h[0]\n" + "mov v7.d[1], x11\n" + "fmla v28.8h, v6.8h, v5.h[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "ldr d6, [x17, #0x20]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "ldr x11, [x17, #0x38]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "ldr x9, [x12, #0x8]\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "mov v6.d[1], x10\n" + "fmla v25.8h, v7.8h, v4.h[0]\n" + "ldr x10, [x17, #0x48]\n" + "fmla v29.8h, v7.8h, v5.h[0]\n" + "ldr d7, [x17, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "add x28, x28, #0x10\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "mov v7.d[1], x11\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "ldr x11, [x17, #0x58]\n" + "fmla v26.8h, v6.8h, v4.h[0]\n" + "ldr x27, [x28, #0x8]\n" + "fmla v30.8h, v6.8h, v5.h[0]\n" + "ldr d6, [x17, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "add x26, x26, #0x10\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "mov v6.d[1], x10\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "ldr x10, [x17, #0x68]\n" + "fmla v27.8h, v7.8h, v4.h[0]\n" + "ldr x25, [x26, #0x8]\n" + "fmla v31.8h, v7.8h, 
v5.h[0]\n" + "ldr d7, [x17, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "add x24, x24, #0x10\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "mov v7.d[1], x11\n" + "fmla v20.8h, v6.8h, v3.h[1]\n" + "ldr x11, [x17, #0x78]\n" + "fmla v24.8h, v6.8h, v4.h[1]\n" + "ldr x23, [x24, #0x8]\n" + "fmla v28.8h, v6.8h, v5.h[1]\n" + "ldr d6, [x17, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "add x22, x22, #0x10\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "mov v6.d[1], x10\n" + "fmla v21.8h, v7.8h, v3.h[1]\n" + "ldr x10, [x17, #0x88]\n" + "fmla v25.8h, v7.8h, v4.h[1]\n" + "ldr x21, [x22, #0x8]\n" + "fmla v29.8h, v7.8h, v5.h[1]\n" + "ldr d7, [x17, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "add x20, x20, #0x10\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "mov v7.d[1], x11\n" + "fmla v22.8h, v6.8h, v3.h[1]\n" + "ldr x11, [x17, #0x98]\n" + "fmla v26.8h, v6.8h, v4.h[1]\n" + "ldr x19, [x20, #0x8]\n" + "fmla v30.8h, v6.8h, v5.h[1]\n" + "ldr d6, [x17, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "sub x13, x13, #0x8\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "cmp x13, #0x10\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "mov v6.d[1], x10\n" + "fmla v23.8h, v7.8h, v3.h[1]\n" + "ldr x10, [x17, #0xa8]\n" + "fmla v27.8h, v7.8h, v4.h[1]\n" + "fmla v31.8h, v7.8h, v5.h[1]\n" + "ldr d7, [x17, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "mov v7.d[1], x11\n" + "fmla v20.8h, v6.8h, v3.h[2]\n" + "ldr x11, [x17, #0xb8]\n" + "fmla v24.8h, v6.8h, v4.h[2]\n" + "fmla v28.8h, v6.8h, v5.h[2]\n" + "ldr d6, [x17, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "mov v6.d[1], x10\n" + "fmla v21.8h, v7.8h, v3.h[2]\n" + "ldr x10, [x17, #0xc8]\n" + "fmla v25.8h, v7.8h, v4.h[2]\n" + "fmla v29.8h, v7.8h, 
v5.h[2]\n" + "ldr d7, [x17, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "mov v7.d[1], x11\n" + "fmla v22.8h, v6.8h, v3.h[2]\n" + "ldr x11, [x17, #0xd8]\n" + "fmla v26.8h, v6.8h, v4.h[2]\n" + "fmla v30.8h, v6.8h, v5.h[2]\n" + "ldr d6, [x17, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "mov v6.d[1], x10\n" + "fmla v23.8h, v7.8h, v3.h[2]\n" + "ldr x10, [x17, #0xe8]\n" + "fmla v27.8h, v7.8h, v4.h[2]\n" + "fmla v31.8h, v7.8h, v5.h[2]\n" + "ldr d7, [x17, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "mov v7.d[1], x11\n" + "fmla v20.8h, v6.8h, v3.h[3]\n" + "ldr x11, [x17, #0xf8]\n" + "fmla v24.8h, v6.8h, v4.h[3]\n" + "fmla v28.8h, v6.8h, v5.h[3]\n" + "ldr d6, [x17, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "mov v6.d[1], x10\n" + "fmla v21.8h, v7.8h, v3.h[3]\n" + "ldr x10, [x17, #0x108]\n" + "fmla v25.8h, v7.8h, v4.h[3]\n" + "fmla v29.8h, v7.8h, v5.h[3]\n" + "ldr d7, [x17, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "mov v7.d[1], x11\n" + "fmla v22.8h, v6.8h, v3.h[3]\n" + "ldr x11, [x17, #0x118]\n" + "fmla v26.8h, v6.8h, v4.h[3]\n" + "fmla v30.8h, v6.8h, v5.h[3]\n" + "ldr d6, [x17, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "mov v6.d[1], x10\n" + "fmla v23.8h, v7.8h, v3.h[3]\n" + "ldr x10, [x17, #0x128]\n" + "fmla v27.8h, v7.8h, v4.h[3]\n" + "fmla v31.8h, v7.8h, v5.h[3]\n" + "ldr d7, [x17, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "mov v7.d[1], x11\n" + "fmla v20.8h, v6.8h, v3.h[4]\n" + "ldr x11, [x17, #0x138]\n" + "fmla v24.8h, v6.8h, v4.h[4]\n" + "fmla v28.8h, v6.8h, v5.h[4]\n" + "ldr d6, 
[x17, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "mov v6.d[1], x10\n" + "fmla v21.8h, v7.8h, v3.h[4]\n" + "ldr x10, [x17, #0x148]\n" + "fmla v25.8h, v7.8h, v4.h[4]\n" + "fmla v29.8h, v7.8h, v5.h[4]\n" + "ldr d7, [x17, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "mov v7.d[1], x11\n" + "fmla v22.8h, v6.8h, v3.h[4]\n" + "ldr x11, [x17, #0x158]\n" + "fmla v26.8h, v6.8h, v4.h[4]\n" + "fmla v30.8h, v6.8h, v5.h[4]\n" + "ldr d6, [x17, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "mov v6.d[1], x10\n" + "fmla v23.8h, v7.8h, v3.h[4]\n" + "ldr x10, [x17, #0x168]\n" + "fmla v27.8h, v7.8h, v4.h[4]\n" + "fmla v31.8h, v7.8h, v5.h[4]\n" + "ldr d7, [x17, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "mov v7.d[1], x11\n" + "fmla v20.8h, v6.8h, v3.h[5]\n" + "ldr x11, [x17, #0x178]\n" + "fmla v24.8h, v6.8h, v4.h[5]\n" + "fmla v28.8h, v6.8h, v5.h[5]\n" + "ldr d6, [x17, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "mov v6.d[1], x10\n" + "fmla v21.8h, v7.8h, v3.h[5]\n" + "ldr x10, [x17, #0x188]\n" + "fmla v25.8h, v7.8h, v4.h[5]\n" + "fmla v29.8h, v7.8h, v5.h[5]\n" + "ldr d7, [x17, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "mov v7.d[1], x11\n" + "fmla v22.8h, v6.8h, v3.h[5]\n" + "ldr x11, [x17, #0x198]\n" + "fmla v26.8h, v6.8h, v4.h[5]\n" + "fmla v30.8h, v6.8h, v5.h[5]\n" + "ldr d6, [x17, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "mov v6.d[1], x10\n" + "fmla v23.8h, v7.8h, v3.h[5]\n" + "ldr x10, [x17, #0x1a8]\n" + "fmla v27.8h, v7.8h, v4.h[5]\n" + "fmla v31.8h, v7.8h, v5.h[5]\n" + "ldr d7, [x17, 
#0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "mov v7.d[1], x11\n" + "fmla v20.8h, v6.8h, v3.h[6]\n" + "ldr x11, [x17, #0x1b8]\n" + "fmla v24.8h, v6.8h, v4.h[6]\n" + "fmla v28.8h, v6.8h, v5.h[6]\n" + "ldr d6, [x17, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "mov v6.d[1], x10\n" + "fmla v21.8h, v7.8h, v3.h[6]\n" + "ldr x10, [x17, #0x1c8]\n" + "fmla v25.8h, v7.8h, v4.h[6]\n" + "fmla v29.8h, v7.8h, v5.h[6]\n" + "ldr d7, [x17, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "mov v7.d[1], x11\n" + "fmla v22.8h, v6.8h, v3.h[6]\n" + "ldr x11, [x17, #0x1d8]\n" + "fmla v26.8h, v6.8h, v4.h[6]\n" + "fmla v30.8h, v6.8h, v5.h[6]\n" + "ldr d6, [x17, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "mov v6.d[1], x10\n" + "fmla v23.8h, v7.8h, v3.h[6]\n" + "ldr x10, [x17, #0x1e8]\n" + "fmla v27.8h, v7.8h, v4.h[6]\n" + "fmla v31.8h, v7.8h, v5.h[6]\n" + "ldr d7, [x17, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "mov v7.d[1], x11\n" + "fmla v20.8h, v6.8h, v3.h[7]\n" + "ldr x11, [x17, #0x1f8]\n" + "fmla v24.8h, v6.8h, v4.h[7]\n" + "fmla v28.8h, v6.8h, v5.h[7]\n" + "ldr d6, [x17, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "mov v6.d[1], x10\n" + "fmla v21.8h, v7.8h, v3.h[7]\n" + "fmla v25.8h, v7.8h, v4.h[7]\n" + "fmla v29.8h, v7.8h, v5.h[7]\n" + "ldr d7, [x17, #0x1f0]\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "add x17, x17, #0x200\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "ldr x10, [x17, #0x8]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "mov v7.d[1], x11\n" + "fmla v22.8h, v6.8h, v3.h[7]\n" + "fmla v26.8h, v6.8h, v4.h[7]\n" + "fmla v30.8h, v6.8h, v5.h[7]\n" + "ldr d6, [x17, #0x0]\n" + "fmla 
v11.8h, v7.8h, v0.h[7]\n" + "ldr d0, [x12, #0x0]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "ldr d1, [x28, #0x0]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "mov v6.d[1], x10\n" + "fmla v23.8h, v7.8h, v3.h[7]\n" + "mov v0.d[1], x9\n" + "fmla v27.8h, v7.8h, v4.h[7]\n" + "mov v1.d[1], x27\n" + "fmla v31.8h, v7.8h, v5.h[7]\n" + "ldr d2, [x26, #0x0]\n" + "ldr d3, [x24, #0x0]\n" + "ldr d4, [x22, #0x0]\n" + "mov v2.d[1], x25\n" + "ldr d5, [x20, #0x0]\n" + "mov v3.d[1], x23\n" + "mov v4.d[1], x21\n" + "mov v5.d[1], x19\n" + "bge 271b\n" + "272:" // Height 6: Multiply loop: Single iteration only + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x17, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "sub x13, x13, #0x8\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "add x12, x12, #0x10\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v24.8h, v6.8h, v4.h[0]\n" + "add x28, x28, #0x10\n" + "fmla v28.8h, v6.8h, v5.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "ldr q6, [x17, #0x20]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "add x26, x26, #0x10\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "add x24, x24, #0x10\n" + "fmla v25.8h, v7.8h, v4.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v29.8h, v7.8h, v5.h[0]\n" + "ldr q7, [x17, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "add x22, x22, #0x10\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "add x20, x20, #0x10\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "fmla v26.8h, v6.8h, v4.h[0]\n" + "fmla v30.8h, v6.8h, v5.h[0]\n" + "ldr q6, [x17, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "fmla v27.8h, v7.8h, v4.h[0]\n" + "fmla v31.8h, v7.8h, v5.h[0]\n" + "ldr q7, [x17, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + 
"fmla v16.8h, v6.8h, v2.h[1]\n" + "fmla v20.8h, v6.8h, v3.h[1]\n" + "fmla v24.8h, v6.8h, v4.h[1]\n" + "fmla v28.8h, v6.8h, v5.h[1]\n" + "ldr q6, [x17, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "fmla v21.8h, v7.8h, v3.h[1]\n" + "fmla v25.8h, v7.8h, v4.h[1]\n" + "fmla v29.8h, v7.8h, v5.h[1]\n" + "ldr q7, [x17, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "fmla v22.8h, v6.8h, v3.h[1]\n" + "fmla v26.8h, v6.8h, v4.h[1]\n" + "fmla v30.8h, v6.8h, v5.h[1]\n" + "ldr q6, [x17, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "fmla v23.8h, v7.8h, v3.h[1]\n" + "fmla v27.8h, v7.8h, v4.h[1]\n" + "fmla v31.8h, v7.8h, v5.h[1]\n" + "ldr q7, [x17, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "fmla v20.8h, v6.8h, v3.h[2]\n" + "fmla v24.8h, v6.8h, v4.h[2]\n" + "fmla v28.8h, v6.8h, v5.h[2]\n" + "ldr q6, [x17, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "fmla v21.8h, v7.8h, v3.h[2]\n" + "fmla v25.8h, v7.8h, v4.h[2]\n" + "fmla v29.8h, v7.8h, v5.h[2]\n" + "ldr q7, [x17, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "fmla v22.8h, v6.8h, v3.h[2]\n" + "fmla v26.8h, v6.8h, v4.h[2]\n" + "fmla v30.8h, v6.8h, v5.h[2]\n" + "ldr q6, [x17, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "fmla v23.8h, v7.8h, v3.h[2]\n" + "fmla v27.8h, v7.8h, v4.h[2]\n" + "fmla v31.8h, v7.8h, v5.h[2]\n" + "ldr q7, [x17, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "fmla v20.8h, v6.8h, v3.h[3]\n" + "fmla v24.8h, v6.8h, v4.h[3]\n" + "fmla v28.8h, v6.8h, v5.h[3]\n" + "ldr q6, [x17, 
#0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "fmla v21.8h, v7.8h, v3.h[3]\n" + "fmla v25.8h, v7.8h, v4.h[3]\n" + "fmla v29.8h, v7.8h, v5.h[3]\n" + "ldr q7, [x17, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "fmla v22.8h, v6.8h, v3.h[3]\n" + "fmla v26.8h, v6.8h, v4.h[3]\n" + "fmla v30.8h, v6.8h, v5.h[3]\n" + "ldr q6, [x17, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "fmla v23.8h, v7.8h, v3.h[3]\n" + "fmla v27.8h, v7.8h, v4.h[3]\n" + "fmla v31.8h, v7.8h, v5.h[3]\n" + "ldr q7, [x17, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "fmla v20.8h, v6.8h, v3.h[4]\n" + "fmla v24.8h, v6.8h, v4.h[4]\n" + "fmla v28.8h, v6.8h, v5.h[4]\n" + "ldr q6, [x17, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "fmla v21.8h, v7.8h, v3.h[4]\n" + "fmla v25.8h, v7.8h, v4.h[4]\n" + "fmla v29.8h, v7.8h, v5.h[4]\n" + "ldr q7, [x17, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "fmla v22.8h, v6.8h, v3.h[4]\n" + "fmla v26.8h, v6.8h, v4.h[4]\n" + "fmla v30.8h, v6.8h, v5.h[4]\n" + "ldr q6, [x17, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "fmla v23.8h, v7.8h, v3.h[4]\n" + "fmla v27.8h, v7.8h, v4.h[4]\n" + "fmla v31.8h, v7.8h, v5.h[4]\n" + "ldr q7, [x17, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "fmla v20.8h, v6.8h, v3.h[5]\n" + "fmla v24.8h, v6.8h, v4.h[5]\n" + "fmla v28.8h, v6.8h, v5.h[5]\n" + "ldr q6, [x17, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "fmla v21.8h, v7.8h, v3.h[5]\n" + 
"fmla v25.8h, v7.8h, v4.h[5]\n" + "fmla v29.8h, v7.8h, v5.h[5]\n" + "ldr q7, [x17, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "fmla v22.8h, v6.8h, v3.h[5]\n" + "fmla v26.8h, v6.8h, v4.h[5]\n" + "fmla v30.8h, v6.8h, v5.h[5]\n" + "ldr q6, [x17, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "fmla v23.8h, v7.8h, v3.h[5]\n" + "fmla v27.8h, v7.8h, v4.h[5]\n" + "fmla v31.8h, v7.8h, v5.h[5]\n" + "ldr q7, [x17, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "fmla v20.8h, v6.8h, v3.h[6]\n" + "fmla v24.8h, v6.8h, v4.h[6]\n" + "fmla v28.8h, v6.8h, v5.h[6]\n" + "ldr q6, [x17, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "fmla v21.8h, v7.8h, v3.h[6]\n" + "fmla v25.8h, v7.8h, v4.h[6]\n" + "fmla v29.8h, v7.8h, v5.h[6]\n" + "ldr q7, [x17, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "fmla v22.8h, v6.8h, v3.h[6]\n" + "fmla v26.8h, v6.8h, v4.h[6]\n" + "fmla v30.8h, v6.8h, v5.h[6]\n" + "ldr q6, [x17, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "fmla v23.8h, v7.8h, v3.h[6]\n" + "fmla v27.8h, v7.8h, v4.h[6]\n" + "fmla v31.8h, v7.8h, v5.h[6]\n" + "ldr q7, [x17, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "fmla v20.8h, v6.8h, v3.h[7]\n" + "fmla v24.8h, v6.8h, v4.h[7]\n" + "fmla v28.8h, v6.8h, v5.h[7]\n" + "ldr q6, [x17, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "fmla v21.8h, v7.8h, v3.h[7]\n" + "fmla v25.8h, v7.8h, v4.h[7]\n" + "fmla v29.8h, v7.8h, v5.h[7]\n" + "ldr q7, [x17, #0x1f0]\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "add x17, x17, 
#0x200\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v22.8h, v6.8h, v3.h[7]\n" + "fmla v26.8h, v6.8h, v4.h[7]\n" + "fmla v30.8h, v6.8h, v5.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "fmla v23.8h, v7.8h, v3.h[7]\n" + "fmla v27.8h, v7.8h, v4.h[7]\n" + "fmla v31.8h, v7.8h, v5.h[7]\n" + "273:" // Height 6: Multiply loop: Main loop skip + "cbz x13, 275f\n" + "274:" // Height 6: Multiply loop: Odd block loop + "ldr h0, [x12], #0x2\n" + "sub x13, x13, #0x1\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr h3, [x24], #0x2\n" + "ldr h4, [x22], #0x2\n" + "ldr h5, [x20], #0x2\n" + "ldr q6, [x17, #0x0]\n" + "ldr q7, [x17, #0x10]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "fmla v24.8h, v6.8h, v4.h[0]\n" + "fmla v28.8h, v6.8h, v5.h[0]\n" + "ldr q6, [x17, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "fmla v25.8h, v7.8h, v4.h[0]\n" + "fmla v29.8h, v7.8h, v5.h[0]\n" + "ldr q7, [x17, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "add x17, x17, #0x40\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "fmla v26.8h, v6.8h, v4.h[0]\n" + "fmla v30.8h, v6.8h, v5.h[0]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "fmla v27.8h, v7.8h, v4.h[0]\n" + "fmla v31.8h, v7.8h, v5.h[0]\n" + "cbnz x13, 274b\n" + "275:" // Height 6: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x14, x14, #0x1\n" + "cmp x14, x19\n" + "bne 268b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x15, #0x0]\n" + "add x25, x15, x19, LSL #1\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add x24, x25, 
x19, LSL #1\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #1\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #1\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #1\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbz %x[flags], #1, 276f\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v1.8h }, [x20]\n" + "ld1r { v0.8h }, [x19]\n" + "fmin v8.8h, v8.8h, v0.8h\n" + "fmin v9.8h, v9.8h, v0.8h\n" + "fmin v10.8h, v10.8h, v0.8h\n" + "fmin v11.8h, v11.8h, v0.8h\n" + "fmin v12.8h, v12.8h, v0.8h\n" + "fmin v13.8h, v13.8h, v0.8h\n" + "fmin v14.8h, v14.8h, v0.8h\n" + "fmin v15.8h, v15.8h, v0.8h\n" + "fmin v16.8h, v16.8h, v0.8h\n" + "fmin v17.8h, v17.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + "fmax v14.8h, v14.8h, v1.8h\n" + "fmax v15.8h, v15.8h, v1.8h\n" + "fmax v16.8h, v16.8h, v1.8h\n" + "fmax v17.8h, v17.8h, v1.8h\n" + "fmin v18.8h, v18.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v0.8h\n" + "fmin v20.8h, v20.8h, v0.8h\n" + "fmin v21.8h, v21.8h, v0.8h\n" + "fmin v22.8h, v22.8h, v0.8h\n" + "fmin v23.8h, v23.8h, v0.8h\n" + "fmin v24.8h, v24.8h, v0.8h\n" + "fmin v25.8h, v25.8h, v0.8h\n" + "fmin v26.8h, v26.8h, v0.8h\n" + "fmin v27.8h, v27.8h, v0.8h\n" + "fmax v18.8h, v18.8h, v1.8h\n" + "fmax v19.8h, v19.8h, v1.8h\n" + "fmax v20.8h, v20.8h, v1.8h\n" + "fmax v21.8h, v21.8h, v1.8h\n" + "fmax v22.8h, v22.8h, v1.8h\n" + "fmax v23.8h, v23.8h, v1.8h\n" + "fmax v24.8h, v24.8h, v1.8h\n" + "fmax v25.8h, v25.8h, v1.8h\n" + "fmax v26.8h, v26.8h, v1.8h\n" + "fmax v27.8h, v27.8h, v1.8h\n" + "fmin v28.8h, v28.8h, v0.8h\n" + "fmin v29.8h, v29.8h, v0.8h\n" + "fmin v30.8h, v30.8h, v0.8h\n" + "fmin v31.8h, v31.8h, v0.8h\n" + "fmax v28.8h, v28.8h, v1.8h\n" + "fmax v29.8h, v29.8h, v1.8h\n" + "fmax v30.8h, v30.8h, v1.8h\n" + "fmax v31.8h, v31.8h, v1.8h\n" + 
"276:" // Height 6: No activation + "cmp x8, #0x20\n" + "bge 293f\n" + "tbz x8, #4, 284f\n" + "st1 { v8.8h }, [x15], #0x10\n" + "st1 { v9.8h }, [x15], #0x10\n" + "st1 { v12.8h }, [x25], #0x10\n" + "st1 { v13.8h }, [x25], #0x10\n" + "st1 { v16.8h }, [x24], #0x10\n" + "st1 { v17.8h }, [x24], #0x10\n" + "st1 { v20.8h }, [x23], #0x10\n" + "st1 { v21.8h }, [x23], #0x10\n" + "st1 { v24.8h }, [x22], #0x10\n" + "st1 { v25.8h }, [x22], #0x10\n" + "st1 { v28.8h }, [x21], #0x10\n" + "st1 { v29.8h }, [x21], #0x10\n" + "tbz x8, #3, 280f\n" + "st1 { v10.8h }, [x15], #0x10\n" + "st1 { v14.8h }, [x25], #0x10\n" + "st1 { v18.8h }, [x24], #0x10\n" + "st1 { v22.8h }, [x23], #0x10\n" + "st1 { v26.8h }, [x22], #0x10\n" + "st1 { v30.8h }, [x21], #0x10\n" + "tbz x8, #2, 278f\n" + "str d11, [x15], #0x8\n" + "str d15, [x25], #0x8\n" + "str d19, [x24], #0x8\n" + "str d23, [x23], #0x8\n" + "str d27, [x22], #0x8\n" + "str d31, [x21], #0x8\n" + "tbz x8, #1, 277f\n" + "st1 { v11.s }[2], [x15], #0x4\n" + "st1 { v15.s }[2], [x25], #0x4\n" + "st1 { v19.s }[2], [x24], #0x4\n" + "st1 { v23.s }[2], [x23], #0x4\n" + "st1 { v27.s }[2], [x22], #0x4\n" + "st1 { v31.s }[2], [x21], #0x4\n" + "tbz x8, #0, 292f\n" + "st1 { v11.h }[6], [x15]\n" + "st1 { v15.h }[6], [x25]\n" + "st1 { v19.h }[6], [x24]\n" + "st1 { v23.h }[6], [x23]\n" + "st1 { v27.h }[6], [x22]\n" + "st1 { v31.h }[6], [x21]\n" + "b 292f\n" + "277:" // Height 6: Partial direct writeback: partial_1_28 + "tbz x8, #0, 292f\n" + "st1 { v11.h }[4], [x15]\n" + "st1 { v15.h }[4], [x25]\n" + "st1 { v19.h }[4], [x24]\n" + "st1 { v23.h }[4], [x23]\n" + "st1 { v27.h }[4], [x22]\n" + "st1 { v31.h }[4], [x21]\n" + "b 292f\n" + "278:" // Height 6: Partial direct writeback: partial_2_24 + "tbz x8, #1, 279f\n" + "str s11, [x15], #0x4\n" + "str s15, [x25], #0x4\n" + "str s19, [x24], #0x4\n" + "str s23, [x23], #0x4\n" + "str s27, [x22], #0x4\n" + "str s31, [x21], #0x4\n" + "tbz x8, #0, 292f\n" + "st1 { v11.h }[2], [x15]\n" + "st1 { v15.h }[2], [x25]\n" + "st1 { 
v19.h }[2], [x24]\n" + "st1 { v23.h }[2], [x23]\n" + "st1 { v27.h }[2], [x22]\n" + "st1 { v31.h }[2], [x21]\n" + "b 292f\n" + "279:" // Height 6: Partial direct writeback: partial_1_24 + "tbz x8, #0, 292f\n" + "str h11, [x15, #0x0]\n" + "str h15, [x25, #0x0]\n" + "str h19, [x24, #0x0]\n" + "str h23, [x23, #0x0]\n" + "str h27, [x22, #0x0]\n" + "str h31, [x21, #0x0]\n" + "b 292f\n" + "280:" // Height 6: Partial direct writeback: partial_4_16 + "tbz x8, #2, 282f\n" + "str d10, [x15], #0x8\n" + "str d14, [x25], #0x8\n" + "str d18, [x24], #0x8\n" + "str d22, [x23], #0x8\n" + "str d26, [x22], #0x8\n" + "str d30, [x21], #0x8\n" + "tbz x8, #1, 281f\n" + "st1 { v10.s }[2], [x15], #0x4\n" + "st1 { v14.s }[2], [x25], #0x4\n" + "st1 { v18.s }[2], [x24], #0x4\n" + "st1 { v22.s }[2], [x23], #0x4\n" + "st1 { v26.s }[2], [x22], #0x4\n" + "st1 { v30.s }[2], [x21], #0x4\n" + "tbz x8, #0, 292f\n" + "st1 { v10.h }[6], [x15]\n" + "st1 { v14.h }[6], [x25]\n" + "st1 { v18.h }[6], [x24]\n" + "st1 { v22.h }[6], [x23]\n" + "st1 { v26.h }[6], [x22]\n" + "st1 { v30.h }[6], [x21]\n" + "b 292f\n" + "281:" // Height 6: Partial direct writeback: partial_1_20 + "tbz x8, #0, 292f\n" + "st1 { v10.h }[4], [x15]\n" + "st1 { v14.h }[4], [x25]\n" + "st1 { v18.h }[4], [x24]\n" + "st1 { v22.h }[4], [x23]\n" + "st1 { v26.h }[4], [x22]\n" + "st1 { v30.h }[4], [x21]\n" + "b 292f\n" + "282:" // Height 6: Partial direct writeback: partial_2_16 + "tbz x8, #1, 283f\n" + "str s10, [x15], #0x4\n" + "str s14, [x25], #0x4\n" + "str s18, [x24], #0x4\n" + "str s22, [x23], #0x4\n" + "str s26, [x22], #0x4\n" + "str s30, [x21], #0x4\n" + "tbz x8, #0, 292f\n" + "st1 { v10.h }[2], [x15]\n" + "st1 { v14.h }[2], [x25]\n" + "st1 { v18.h }[2], [x24]\n" + "st1 { v22.h }[2], [x23]\n" + "st1 { v26.h }[2], [x22]\n" + "st1 { v30.h }[2], [x21]\n" + "b 292f\n" + "283:" // Height 6: Partial direct writeback: partial_1_16 + "tbz x8, #0, 292f\n" + "str h10, [x15, #0x0]\n" + "str h14, [x25, #0x0]\n" + "str h18, [x24, #0x0]\n" + "str h22, 
[x23, #0x0]\n" + "str h26, [x22, #0x0]\n" + "str h30, [x21, #0x0]\n" + "b 292f\n" + "284:" // Height 6: Partial direct writeback: partial_8_0 + "tbz x8, #3, 288f\n" + "st1 { v8.8h }, [x15], #0x10\n" + "st1 { v12.8h }, [x25], #0x10\n" + "st1 { v16.8h }, [x24], #0x10\n" + "st1 { v20.8h }, [x23], #0x10\n" + "st1 { v24.8h }, [x22], #0x10\n" + "st1 { v28.8h }, [x21], #0x10\n" + "tbz x8, #2, 286f\n" + "str d9, [x15], #0x8\n" + "str d13, [x25], #0x8\n" + "str d17, [x24], #0x8\n" + "str d21, [x23], #0x8\n" + "str d25, [x22], #0x8\n" + "str d29, [x21], #0x8\n" + "tbz x8, #1, 285f\n" + "st1 { v9.s }[2], [x15], #0x4\n" + "st1 { v13.s }[2], [x25], #0x4\n" + "st1 { v17.s }[2], [x24], #0x4\n" + "st1 { v21.s }[2], [x23], #0x4\n" + "st1 { v25.s }[2], [x22], #0x4\n" + "st1 { v29.s }[2], [x21], #0x4\n" + "tbz x8, #0, 292f\n" + "st1 { v9.h }[6], [x15]\n" + "st1 { v13.h }[6], [x25]\n" + "st1 { v17.h }[6], [x24]\n" + "st1 { v21.h }[6], [x23]\n" + "st1 { v25.h }[6], [x22]\n" + "st1 { v29.h }[6], [x21]\n" + "b 292f\n" + "285:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x8, #0, 292f\n" + "st1 { v9.h }[4], [x15]\n" + "st1 { v13.h }[4], [x25]\n" + "st1 { v17.h }[4], [x24]\n" + "st1 { v21.h }[4], [x23]\n" + "st1 { v25.h }[4], [x22]\n" + "st1 { v29.h }[4], [x21]\n" + "b 292f\n" + "286:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x8, #1, 287f\n" + "str s9, [x15], #0x4\n" + "str s13, [x25], #0x4\n" + "str s17, [x24], #0x4\n" + "str s21, [x23], #0x4\n" + "str s25, [x22], #0x4\n" + "str s29, [x21], #0x4\n" + "tbz x8, #0, 292f\n" + "st1 { v9.h }[2], [x15]\n" + "st1 { v13.h }[2], [x25]\n" + "st1 { v17.h }[2], [x24]\n" + "st1 { v21.h }[2], [x23]\n" + "st1 { v25.h }[2], [x22]\n" + "st1 { v29.h }[2], [x21]\n" + "b 292f\n" + "287:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x8, #0, 292f\n" + "str h9, [x15, #0x0]\n" + "str h13, [x25, #0x0]\n" + "str h17, [x24, #0x0]\n" + "str h21, [x23, #0x0]\n" + "str h25, [x22, #0x0]\n" + "str h29, [x21, #0x0]\n" + "b 
292f\n" + "288:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x8, #2, 290f\n" + "str d8, [x15], #0x8\n" + "str d12, [x25], #0x8\n" + "str d16, [x24], #0x8\n" + "str d20, [x23], #0x8\n" + "str d24, [x22], #0x8\n" + "str d28, [x21], #0x8\n" + "tbz x8, #1, 289f\n" + "st1 { v8.s }[2], [x15], #0x4\n" + "st1 { v12.s }[2], [x25], #0x4\n" + "st1 { v16.s }[2], [x24], #0x4\n" + "st1 { v20.s }[2], [x23], #0x4\n" + "st1 { v24.s }[2], [x22], #0x4\n" + "st1 { v28.s }[2], [x21], #0x4\n" + "tbz x8, #0, 292f\n" + "st1 { v8.h }[6], [x15]\n" + "st1 { v12.h }[6], [x25]\n" + "st1 { v16.h }[6], [x24]\n" + "st1 { v20.h }[6], [x23]\n" + "st1 { v24.h }[6], [x22]\n" + "st1 { v28.h }[6], [x21]\n" + "b 292f\n" + "289:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x8, #0, 292f\n" + "st1 { v8.h }[4], [x15]\n" + "st1 { v12.h }[4], [x25]\n" + "st1 { v16.h }[4], [x24]\n" + "st1 { v20.h }[4], [x23]\n" + "st1 { v24.h }[4], [x22]\n" + "st1 { v28.h }[4], [x21]\n" + "b 292f\n" + "290:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x8, #1, 291f\n" + "str s8, [x15], #0x4\n" + "str s12, [x25], #0x4\n" + "str s16, [x24], #0x4\n" + "str s20, [x23], #0x4\n" + "str s24, [x22], #0x4\n" + "str s28, [x21], #0x4\n" + "tbz x8, #0, 292f\n" + "st1 { v8.h }[2], [x15]\n" + "st1 { v12.h }[2], [x25]\n" + "st1 { v16.h }[2], [x24]\n" + "st1 { v20.h }[2], [x23]\n" + "st1 { v24.h }[2], [x22]\n" + "st1 { v28.h }[2], [x21]\n" + "b 292f\n" + "291:" // Height 6: Partial direct writeback: partial_1_0 + "str h8, [x15, #0x0]\n" + "str h12, [x25, #0x0]\n" + "str h16, [x24, #0x0]\n" + "str h20, [x23, #0x0]\n" + "str h24, [x22, #0x0]\n" + "str h28, [x21, #0x0]\n" + "292:" // Height 6: Partial direct writeback: Done + "b 294f\n" + "293:" // Height 6: Full writeback + "str q8, [x15, #0x0]\n" + "str q9, [x15, #0x10]\n" + "str q10, [x15, #0x20]\n" + "str q11, [x15, #0x30]\n" + "add x15, x15, #0x40\n" + "str q12, [x25, #0x0]\n" + "str q13, [x25, #0x10]\n" + "str q14, [x25, #0x20]\n" + "str q15, [x25, 
#0x30]\n" + "str q16, [x24, #0x0]\n" + "str q17, [x24, #0x10]\n" + "str q18, [x24, #0x20]\n" + "str q19, [x24, #0x30]\n" + "str q20, [x23, #0x0]\n" + "str q21, [x23, #0x10]\n" + "str q22, [x23, #0x20]\n" + "str q23, [x23, #0x30]\n" + "str q24, [x22, #0x0]\n" + "str q25, [x22, #0x10]\n" + "str q26, [x22, #0x20]\n" + "str q27, [x22, #0x30]\n" + "str q28, [x21, #0x0]\n" + "str q29, [x21, #0x10]\n" + "str q30, [x21, #0x20]\n" + "str q31, [x21, #0x30]\n" + "294:" // Height 6: Writeback done + "subs x8, x8, #0x20\n" + "bgt 247b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 296f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 295f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "295:" // Update direct input + "mov x19, #0xc\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "296:" // Exit + + : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp index 32e341566f..6e51773166 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp @@ -27,6 +27,7 @@ #include "../../utils.hpp" #include +#include namespace arm_gemm { @@ -96,342 +97,336 @@ void a64_hybrid_fp16_mla_6x32 ( #endif "1:" // Row loop "cmp %x[M], #0x6\n" - "bge 251f\n" + "bge 246f\n" "cmp %x[M], #0x4\n" - "bgt 201f\n" - "beq 151f\n" + "bgt 197f\n" + "beq 148f\n" "cmp %x[M], #0x2\n" - "bgt 101f\n" - "beq 51f\n" - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "mov x14, %x[bias]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 2f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #1\n" - "b 3f\n" - "2:" // Height 1: setup direct output - "mov x13, %x[output_ptr]\n" - "3:" // Height 1: Column loop - "cbz x14, 4f\n" - "ldr q8, [x14, #0x0]\n" - "ldr q9, [x14, #0x10]\n" - "ldr q10, [x14, #0x20]\n" - "ldr q11, [x14, #0x30]\n" - "add x14, x14, #0x40\n" - "b 23f\n" - "4:" // Height 1: no bias - "tbz %x[flags], #0, 22f\n" - "cmp x16, #0x20\n" - "bge 21f\n" - "tbz x16, #4, 12f\n" - "ld1 { v8.8h }, [x13], #0x10\n" - "ld1 { v9.8h }, [x13], #0x10\n" - "tbz x16, #3, 8f\n" - "ld1 { v10.8h }, [x13], #0x10\n" - "tbz x16, #2, 6f\n" - "ldr d11, [x13], #0x8\n" - "tbz x16, #1, 5f\n" + "bgt 99f\n" + "beq 50f\n" + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x9, %x[bias]\n" + "mov x28, %x[output_ptr]\n" + "2:" // Height 1: Column loop + "cbz x9, 3f\n" + "ldr q8, [x9, #0x0]\n" + "ldr q9, [x9, #0x10]\n" + "ldr q10, [x9, #0x20]\n" + "ldr q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + "b 22f\n" + "3:" // Height 1: no bias + "tbz %x[flags], #0, 21f\n" + "cmp 
x11, #0x20\n" + "bge 20f\n" + "tbz x11, #4, 11f\n" + "ld1 { v8.8h }, [x28], #0x10\n" + "ld1 { v9.8h }, [x28], #0x10\n" + "tbz x11, #3, 7f\n" + "ld1 { v10.8h }, [x28], #0x10\n" + "tbz x11, #2, 5f\n" + "ldr d11, [x28], #0x8\n" + "tbz x11, #1, 4f\n" "mov x19, #0x3c\n" - "ld1 { v11.s }[2], [x13], #0x4\n" - "tbz x16, #0, 20f\n" - "ld1 { v11.h }[6], [x13]\n" - "b 20f\n" - "5:" // Height 1: Partial accumulate: partial_1_28 + "ld1 { v11.s }[2], [x28], #0x4\n" + "tbz x11, #0, 19f\n" + "ld1 { v11.h }[6], [x28]\n" + "b 19f\n" + "4:" // Height 1: Partial accumulate: partial_1_28 "mov x19, #0x38\n" - "tbz x16, #0, 20f\n" - "ld1 { v11.h }[4], [x13]\n" - "b 20f\n" - "6:" // Height 1: Partial accumulate: partial_2_24 - "tbz x16, #1, 7f\n" - "ldr s11, [x13], #0x4\n" + "tbz x11, #0, 19f\n" + "ld1 { v11.h }[4], [x28]\n" + "b 19f\n" + "5:" // Height 1: Partial accumulate: partial_2_24 + "tbz x11, #1, 6f\n" + "ldr s11, [x28], #0x4\n" "mov x19, #0x34\n" - "tbz x16, #0, 20f\n" - "ld1 { v11.h }[2], [x13]\n" - "b 20f\n" - "7:" // Height 1: Partial accumulate: partial_1_24 + "tbz x11, #0, 19f\n" + "ld1 { v11.h }[2], [x28]\n" + "b 19f\n" + "6:" // Height 1: Partial accumulate: partial_1_24 "mov x19, #0x30\n" - "tbz x16, #0, 20f\n" - "ldr h11, [x13, #0x0]\n" - "b 20f\n" - "8:" // Height 1: Partial accumulate: partial_4_16 - "tbz x16, #2, 10f\n" - "ldr d10, [x13], #0x8\n" - "tbz x16, #1, 9f\n" + "tbz x11, #0, 19f\n" + "ldr h11, [x28, #0x0]\n" + "b 19f\n" + "7:" // Height 1: Partial accumulate: partial_4_16 + "tbz x11, #2, 9f\n" + "ldr d10, [x28], #0x8\n" + "tbz x11, #1, 8f\n" + "ld1 { v10.s }[2], [x28], #0x4\n" "mov x19, #0x2c\n" - "ld1 { v10.s }[2], [x13], #0x4\n" - "tbz x16, #0, 20f\n" - "ld1 { v10.h }[6], [x13]\n" - "b 20f\n" - "9:" // Height 1: Partial accumulate: partial_1_20 + "tbz x11, #0, 19f\n" + "ld1 { v10.h }[6], [x28]\n" + "b 19f\n" + "8:" // Height 1: Partial accumulate: partial_1_20 "mov x19, #0x28\n" - "tbz x16, #0, 20f\n" - "ld1 { v10.h }[4], [x13]\n" - "b 20f\n" - "10:" // 
Height 1: Partial accumulate: partial_2_16 - "tbz x16, #1, 11f\n" - "ldr s10, [x13], #0x4\n" + "tbz x11, #0, 19f\n" + "ld1 { v10.h }[4], [x28]\n" + "b 19f\n" + "9:" // Height 1: Partial accumulate: partial_2_16 + "tbz x11, #1, 10f\n" + "ldr s10, [x28], #0x4\n" "mov x19, #0x24\n" - "tbz x16, #0, 20f\n" - "ld1 { v10.h }[2], [x13]\n" - "b 20f\n" - "11:" // Height 1: Partial accumulate: partial_1_16 + "tbz x11, #0, 19f\n" + "ld1 { v10.h }[2], [x28]\n" + "b 19f\n" + "10:" // Height 1: Partial accumulate: partial_1_16 "mov x19, #0x20\n" - "tbz x16, #0, 20f\n" - "ldr h10, [x13, #0x0]\n" - "b 20f\n" - "12:" // Height 1: Partial accumulate: partial_8_0 - "tbz x16, #3, 16f\n" - "ld1 { v8.8h }, [x13], #0x10\n" - "tbz x16, #2, 14f\n" - "ldr d9, [x13], #0x8\n" - "tbz x16, #1, 13f\n" + "tbz x11, #0, 19f\n" + "ldr h10, [x28, #0x0]\n" + "b 19f\n" + "11:" // Height 1: Partial accumulate: partial_8_0 + "tbz x11, #3, 15f\n" + "ld1 { v8.8h }, [x28], #0x10\n" + "tbz x11, #2, 13f\n" + "ldr d9, [x28], #0x8\n" + "tbz x11, #1, 12f\n" "mov x19, #0x1c\n" - "ld1 { v9.s }[2], [x13], #0x4\n" - "tbz x16, #0, 20f\n" - "ld1 { v9.h }[6], [x13]\n" - "b 20f\n" - "13:" // Height 1: Partial accumulate: partial_1_12 + "ld1 { v9.s }[2], [x28], #0x4\n" + "tbz x11, #0, 19f\n" + "ld1 { v9.h }[6], [x28]\n" + "b 19f\n" + "12:" // Height 1: Partial accumulate: partial_1_12 "mov x19, #0x18\n" - "tbz x16, #0, 20f\n" - "ld1 { v9.h }[4], [x13]\n" - "b 20f\n" - "14:" // Height 1: Partial accumulate: partial_2_8 - "tbz x16, #1, 15f\n" - "ldr s9, [x13], #0x4\n" + "tbz x11, #0, 19f\n" + "ld1 { v9.h }[4], [x28]\n" + "b 19f\n" + "13:" // Height 1: Partial accumulate: partial_2_8 + "tbz x11, #1, 14f\n" + "ldr s9, [x28], #0x4\n" "mov x19, #0x14\n" - "tbz x16, #0, 20f\n" - "ld1 { v9.h }[2], [x13]\n" - "b 20f\n" - "15:" // Height 1: Partial accumulate: partial_1_8 + "tbz x11, #0, 19f\n" + "ld1 { v9.h }[2], [x28]\n" + "b 19f\n" + "14:" // Height 1: Partial accumulate: partial_1_8 "mov x19, #0x10\n" - "tbz x16, #0, 20f\n" - 
"ldr h9, [x13, #0x0]\n" - "b 20f\n" - "16:" // Height 1: Partial accumulate: partial_4_0 - "tbz x16, #2, 18f\n" - "ldr d8, [x13], #0x8\n" - "tbz x16, #1, 17f\n" + "tbz x11, #0, 19f\n" + "ldr h9, [x28, #0x0]\n" + "b 19f\n" + "15:" // Height 1: Partial accumulate: partial_4_0 + "tbz x11, #2, 17f\n" + "ldr d8, [x28], #0x8\n" + "tbz x11, #1, 16f\n" + "ld1 { v8.s }[2], [x28], #0x4\n" "mov x19, #0xc\n" - "ld1 { v8.s }[2], [x13], #0x4\n" - "tbz x16, #0, 20f\n" - "ld1 { v8.h }[6], [x13]\n" - "b 20f\n" - "17:" // Height 1: Partial accumulate: partial_1_4 + "tbz x11, #0, 19f\n" + "ld1 { v8.h }[6], [x28]\n" + "b 19f\n" + "16:" // Height 1: Partial accumulate: partial_1_4 "mov x19, #0x8\n" - "tbz x16, #0, 20f\n" - "ld1 { v8.h }[4], [x13]\n" - "b 20f\n" - "18:" // Height 1: Partial accumulate: partial_2_0 - "tbz x16, #1, 19f\n" - "ldr s8, [x13], #0x4\n" + "tbz x11, #0, 19f\n" + "ld1 { v8.h }[4], [x28]\n" + "b 19f\n" + "17:" // Height 1: Partial accumulate: partial_2_0 + "tbz x11, #1, 18f\n" + "ldr s8, [x28], #0x4\n" "mov x19, #0x4\n" - "tbz x16, #0, 20f\n" - "ld1 { v8.h }[2], [x13]\n" - "b 20f\n" - "19:" // Height 1: Partial accumulate: partial_1_0 + "tbz x11, #0, 19f\n" + "ld1 { v8.h }[2], [x28]\n" + "b 19f\n" + "18:" // Height 1: Partial accumulate: partial_1_0 + "ldr h8, [x28, #0x0]\n" "mov x19, #0x0\n" - "ldr h8, [x13, #0x0]\n" - "20:" // Height 1: Partial accumulate: Done - "sub x13, x13, x19\n" - "b 23f\n" - "21:" // Height 1: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "b 23f\n" - "22:" // Height 1: no accumulate + "19:" // Height 1: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 22f\n" + "20:" // Height 1: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "b 22f\n" + "21:" // Height 1: no accumulate "movi v8.16b, #0x0\n" "movi v9.16b, #0x0\n" "movi v10.16b, #0x0\n" "movi v11.16b, #0x0\n" - "23:" // Height 
1: setup done - "mov x12, #0x0\n" - "24:" // Height 1: String loop + "22:" // Height 1: setup done + "mov x27, #0x0\n" + "23:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 25f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 24f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "cbnz x12, 26f\n" + "ldr x25, [x20, #0x0]\n" + "cbnz x27, 25f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #1\n" - "b 26f\n" - "25:" // Height 1: setup direct input - "mov x10, %x[input_ptr]\n" - "26:" // Height 1: input setup done - "cmp x11, #0x8\n" - "blt 29f\n" - "cmp x11, #0x10\n" + "add x25, x25, x19, LSL #1\n" + "b 25f\n" + "24:" // Height 1: setup direct input + "mov x25, %x[input_ptr]\n" + "25:" // Height 1: input setup done + "cmp x26, #0x8\n" "blt 28f\n" - "27:" // Height 1: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "ldr q0, [x25, #0x0]\n" + "ldr q6, [x10, #0x0]\n" + "cmp x26, #0x10\n" + "blt 27f\n" + "26:" // Height 1: Multiply loop: Main loop head "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q7, [x10, #0x10]\n" + "add x25, x25, #0x10\n" "fmla v9.8h, v7.8h, v0.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q6, [x10, #0x20]\n" + "sub x26, x26, #0x8\n" "fmla v10.8h, v6.8h, v0.h[0]\n" - "ldr q6, [x15, #0x40]\n" - "add x10, x10, #0x10\n" + "ldr q7, [x10, #0x30]\n" + "cmp x26, #0x10\n" "fmla v11.8h, v7.8h, v0.h[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q6, [x10, #0x40]\n" + "ldr q7, [x10, #0x50]\n" "fmla v8.8h, v6.8h, v0.h[1]\n" - "ldr q6, [x15, #0x60]\n" - "sub x11, x11, #0x8\n" + "ldr q6, [x10, #0x60]\n" "fmla v9.8h, v7.8h, v0.h[1]\n" - "ldr q7, [x15, #0x70]\n" - 
"cmp x11, #0x10\n" + "ldr q7, [x10, #0x70]\n" + "prfm pldl1keep, [x25, #0x80]\n" "fmla v10.8h, v6.8h, v0.h[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" "fmla v11.8h, v7.8h, v0.h[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" "fmla v8.8h, v6.8h, v0.h[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" "fmla v9.8h, v7.8h, v0.h[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" "fmla v10.8h, v6.8h, v0.h[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" "fmla v11.8h, v7.8h, v0.h[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" "fmla v8.8h, v6.8h, v0.h[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" "fmla v9.8h, v7.8h, v0.h[3]\n" - "ldr q7, [x15, #0xf0]\n" + "ldr q7, [x10, #0xf0]\n" "fmla v10.8h, v6.8h, v0.h[3]\n" - "ldr q6, [x15, #0x100]\n" + "ldr q6, [x10, #0x100]\n" "fmla v11.8h, v7.8h, v0.h[3]\n" - "ldr q7, [x15, #0x110]\n" + "ldr q7, [x10, #0x110]\n" "fmla v8.8h, v6.8h, v0.h[4]\n" - "ldr q6, [x15, #0x120]\n" + "ldr q6, [x10, #0x120]\n" "fmla v9.8h, v7.8h, v0.h[4]\n" - "ldr q7, [x15, #0x130]\n" + "ldr q7, [x10, #0x130]\n" "fmla v10.8h, v6.8h, v0.h[4]\n" - "ldr q6, [x15, #0x140]\n" + "ldr q6, [x10, #0x140]\n" "fmla v11.8h, v7.8h, v0.h[4]\n" - "ldr q7, [x15, #0x150]\n" + "ldr q7, [x10, #0x150]\n" "fmla v8.8h, v6.8h, v0.h[5]\n" - "ldr q6, [x15, #0x160]\n" + "ldr q6, [x10, #0x160]\n" "fmla v9.8h, v7.8h, v0.h[5]\n" - "ldr q7, [x15, #0x170]\n" + "ldr q7, [x10, #0x170]\n" "fmla v10.8h, v6.8h, v0.h[5]\n" - "ldr q6, [x15, #0x180]\n" + "ldr q6, [x10, #0x180]\n" "fmla v11.8h, v7.8h, v0.h[5]\n" - "ldr q7, [x15, #0x190]\n" + "ldr q7, [x10, #0x190]\n" "fmla v8.8h, v6.8h, v0.h[6]\n" - "ldr q6, [x15, #0x1a0]\n" + "ldr q6, [x10, #0x1a0]\n" "fmla v9.8h, v7.8h, v0.h[6]\n" - "ldr q7, [x15, #0x1b0]\n" + "ldr q7, [x10, #0x1b0]\n" "fmla v10.8h, v6.8h, v0.h[6]\n" - "ldr q6, [x15, #0x1c0]\n" + "ldr q6, [x10, #0x1c0]\n" "fmla v11.8h, v7.8h, v0.h[6]\n" - "ldr q7, [x15, #0x1d0]\n" + "ldr q7, [x10, #0x1d0]\n" "fmla 
v8.8h, v6.8h, v0.h[7]\n" - "ldr q6, [x15, #0x1e0]\n" + "ldr q6, [x10, #0x1e0]\n" "fmla v9.8h, v7.8h, v0.h[7]\n" - "ldr q7, [x15, #0x1f0]\n" - "add x15, x15, #0x200\n" + "ldr q7, [x10, #0x1f0]\n" + "add x10, x10, #0x200\n" "fmla v10.8h, v6.8h, v0.h[7]\n" + "ldr q6, [x10, #0x0]\n" "fmla v11.8h, v7.8h, v0.h[7]\n" - "bge 27b\n" - "28:" // Height 1: Multiply loop: Single iteration only - "sub x11, x11, #0x8\n" - "ldr q0, [x10, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "ldr q0, [x25, #0x0]\n" + "bge 26b\n" + "27:" // Height 1: Multiply loop: Single iteration only "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q7, [x10, #0x10]\n" + "sub x26, x26, #0x8\n" "fmla v9.8h, v7.8h, v0.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q6, [x10, #0x20]\n" + "add x25, x25, #0x10\n" "fmla v10.8h, v6.8h, v0.h[0]\n" - "ldr q6, [x15, #0x40]\n" - "add x10, x10, #0x10\n" + "ldr q7, [x10, #0x30]\n" + "ldr q6, [x10, #0x40]\n" "fmla v11.8h, v7.8h, v0.h[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" "fmla v8.8h, v6.8h, v0.h[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" + "prfm pldl1keep, [x25, #0x80]\n" "fmla v9.8h, v7.8h, v0.h[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" "fmla v10.8h, v6.8h, v0.h[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" "fmla v11.8h, v7.8h, v0.h[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" "fmla v8.8h, v6.8h, v0.h[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" "fmla v9.8h, v7.8h, v0.h[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" "fmla v10.8h, v6.8h, v0.h[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" "fmla v11.8h, v7.8h, v0.h[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" "fmla v8.8h, v6.8h, v0.h[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" "fmla v9.8h, v7.8h, v0.h[3]\n" - "ldr q7, [x15, #0xf0]\n" + "ldr q7, [x10, #0xf0]\n" "fmla v10.8h, v6.8h, v0.h[3]\n" - "ldr q6, 
[x15, #0x100]\n" + "ldr q6, [x10, #0x100]\n" "fmla v11.8h, v7.8h, v0.h[3]\n" - "ldr q7, [x15, #0x110]\n" + "ldr q7, [x10, #0x110]\n" "fmla v8.8h, v6.8h, v0.h[4]\n" - "ldr q6, [x15, #0x120]\n" + "ldr q6, [x10, #0x120]\n" "fmla v9.8h, v7.8h, v0.h[4]\n" - "ldr q7, [x15, #0x130]\n" + "ldr q7, [x10, #0x130]\n" "fmla v10.8h, v6.8h, v0.h[4]\n" - "ldr q6, [x15, #0x140]\n" + "ldr q6, [x10, #0x140]\n" "fmla v11.8h, v7.8h, v0.h[4]\n" - "ldr q7, [x15, #0x150]\n" + "ldr q7, [x10, #0x150]\n" "fmla v8.8h, v6.8h, v0.h[5]\n" - "ldr q6, [x15, #0x160]\n" + "ldr q6, [x10, #0x160]\n" "fmla v9.8h, v7.8h, v0.h[5]\n" - "ldr q7, [x15, #0x170]\n" + "ldr q7, [x10, #0x170]\n" "fmla v10.8h, v6.8h, v0.h[5]\n" - "ldr q6, [x15, #0x180]\n" + "ldr q6, [x10, #0x180]\n" "fmla v11.8h, v7.8h, v0.h[5]\n" - "ldr q7, [x15, #0x190]\n" + "ldr q7, [x10, #0x190]\n" "fmla v8.8h, v6.8h, v0.h[6]\n" - "ldr q6, [x15, #0x1a0]\n" + "ldr q6, [x10, #0x1a0]\n" "fmla v9.8h, v7.8h, v0.h[6]\n" - "ldr q7, [x15, #0x1b0]\n" + "ldr q7, [x10, #0x1b0]\n" "fmla v10.8h, v6.8h, v0.h[6]\n" - "ldr q6, [x15, #0x1c0]\n" + "ldr q6, [x10, #0x1c0]\n" "fmla v11.8h, v7.8h, v0.h[6]\n" - "ldr q7, [x15, #0x1d0]\n" + "ldr q7, [x10, #0x1d0]\n" "fmla v8.8h, v6.8h, v0.h[7]\n" - "ldr q6, [x15, #0x1e0]\n" + "ldr q6, [x10, #0x1e0]\n" "fmla v9.8h, v7.8h, v0.h[7]\n" - "ldr q7, [x15, #0x1f0]\n" - "add x15, x15, #0x200\n" + "ldr q7, [x10, #0x1f0]\n" + "add x10, x10, #0x200\n" "fmla v10.8h, v6.8h, v0.h[7]\n" "fmla v11.8h, v7.8h, v0.h[7]\n" - "29:" // Height 1: Multiply loop: Main loop skip - "cbz x11, 31f\n" - "30:" // Height 1: Multiply loop: Odd block loop - "ldr h0, [x10], #0x2\n" - "ldr q6, [x15, #0x0]\n" + "28:" // Height 1: Multiply loop: Main loop skip + "cbz x26, 30f\n" + "29:" // Height 1: Multiply loop: Odd block loop + "ldr h0, [x25], #0x2\n" + "sub x26, x26, #0x1\n" + "ldr q6, [x10, #0x0]\n" "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q7, [x10, #0x10]\n" + "ldr q6, [x10, #0x20]\n" "fmla v9.8h, 
v7.8h, v0.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" "fmla v10.8h, v6.8h, v0.h[0]\n" - "sub x11, x11, #0x1\n" - "add x15, x15, #0x40\n" "fmla v11.8h, v7.8h, v0.h[0]\n" - "cbnz x11, 30b\n" - "31:" // Height 1: Multiply loop: No odd multiplies + "cbnz x26, 29b\n" + "30:" // Height 1: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 24b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "tbz %x[flags], #1, 32f\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 23b\n" + "prfm pstl1keep, [x28, #0x0]\n" + "tbz %x[flags], #1, 31f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1r { v1.8h }, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -444,298 +439,290 @@ void a64_hybrid_fp16_mla_6x32 ( "fmax v9.8h, v9.8h, v1.8h\n" "fmax v10.8h, v10.8h, v1.8h\n" "fmax v11.8h, v11.8h, v1.8h\n" - "32:" // Height 1: No activation - "cmp x16, #0x20\n" - "bge 49f\n" - "tbz x16, #4, 40f\n" - "st1 { v8.8h }, [x13], #0x10\n" - "st1 { v9.8h }, [x13], #0x10\n" - "tbz x16, #3, 36f\n" - "st1 { v10.8h }, [x13], #0x10\n" - "tbz x16, #2, 34f\n" - "str d11, [x13], #0x8\n" - "tbz x16, #1, 33f\n" - "st1 { v11.s }[2], [x13], #0x4\n" - "tbz x16, #0, 48f\n" - "st1 { v11.h }[6], [x13]\n" - "b 48f\n" - "33:" // Height 1: Partial direct writeback: partial_1_28 - "tbz x16, #0, 48f\n" - "st1 { v11.h }[4], [x13]\n" - "b 48f\n" - "34:" // Height 1: Partial direct writeback: partial_2_24 - "tbz x16, #1, 35f\n" - "str s11, [x13], #0x4\n" - "tbz x16, #0, 48f\n" - "st1 { v11.h }[2], [x13]\n" - "b 48f\n" - "35:" // Height 1: Partial direct writeback: partial_1_24 - "tbz x16, #0, 48f\n" - "str h11, [x13, #0x0]\n" - "b 48f\n" - "36:" // Height 1: Partial direct writeback: partial_4_16 - "tbz x16, #2, 38f\n" - "str d10, [x13], #0x8\n" - "tbz x16, #1, 37f\n" - "st1 { v10.s }[2], [x13], #0x4\n" - "tbz x16, #0, 48f\n" - "st1 { v10.h }[6], [x13]\n" - "b 48f\n" - "37:" // Height 1: Partial direct 
writeback: partial_1_20 - "tbz x16, #0, 48f\n" - "st1 { v10.h }[4], [x13]\n" - "b 48f\n" - "38:" // Height 1: Partial direct writeback: partial_2_16 - "tbz x16, #1, 39f\n" - "str s10, [x13], #0x4\n" - "tbz x16, #0, 48f\n" - "st1 { v10.h }[2], [x13]\n" - "b 48f\n" - "39:" // Height 1: Partial direct writeback: partial_1_16 - "tbz x16, #0, 48f\n" - "str h10, [x13, #0x0]\n" - "b 48f\n" - "40:" // Height 1: Partial direct writeback: partial_8_0 - "tbz x16, #3, 44f\n" - "st1 { v8.8h }, [x13], #0x10\n" - "tbz x16, #2, 42f\n" - "str d9, [x13], #0x8\n" - "tbz x16, #1, 41f\n" - "st1 { v9.s }[2], [x13], #0x4\n" - "tbz x16, #0, 48f\n" - "st1 { v9.h }[6], [x13]\n" - "b 48f\n" - "41:" // Height 1: Partial direct writeback: partial_1_12 - "tbz x16, #0, 48f\n" - "st1 { v9.h }[4], [x13]\n" - "b 48f\n" - "42:" // Height 1: Partial direct writeback: partial_2_8 - "tbz x16, #1, 43f\n" - "str s9, [x13], #0x4\n" - "tbz x16, #0, 48f\n" - "st1 { v9.h }[2], [x13]\n" - "b 48f\n" - "43:" // Height 1: Partial direct writeback: partial_1_8 - "tbz x16, #0, 48f\n" - "str h9, [x13, #0x0]\n" - "b 48f\n" - "44:" // Height 1: Partial direct writeback: partial_4_0 - "tbz x16, #2, 46f\n" - "str d8, [x13], #0x8\n" - "tbz x16, #1, 45f\n" - "st1 { v8.s }[2], [x13], #0x4\n" - "tbz x16, #0, 48f\n" - "st1 { v8.h }[6], [x13]\n" - "b 48f\n" - "45:" // Height 1: Partial direct writeback: partial_1_4 - "tbz x16, #0, 48f\n" - "st1 { v8.h }[4], [x13]\n" - "b 48f\n" - "46:" // Height 1: Partial direct writeback: partial_2_0 - "tbz x16, #1, 47f\n" - "str s8, [x13], #0x4\n" - "tbz x16, #0, 48f\n" - "st1 { v8.h }[2], [x13]\n" - "b 48f\n" - "47:" // Height 1: Partial direct writeback: partial_1_0 - "str h8, [x13, #0x0]\n" - "48:" // Height 1: Partial direct writeback: Done - "b 50f\n" - "49:" // Height 1: Full writeback - "str q8, [x13, #0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "add x13, x13, #0x40\n" - "50:" // Height 1: Writeback done - "subs x16, x16, #0x20\n" - 
"bgt 3b\n" - "b 302f\n" - "51:" // Height 2 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 52f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #1\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "add x9, x9, x19, LSL #1\n" - "b 53f\n" - "52:" // Height 2: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #1\n" - "53:" // Height 2: Column loop - "cbz x14, 54f\n" - "ldr q8, [x14, #0x0]\n" + "31:" // Height 1: No activation + "cmp x11, #0x20\n" + "bge 48f\n" + "tbz x11, #4, 39f\n" + "st1 { v8.8h }, [x28], #0x10\n" + "st1 { v9.8h }, [x28], #0x10\n" + "tbz x11, #3, 35f\n" + "st1 { v10.8h }, [x28], #0x10\n" + "tbz x11, #2, 33f\n" + "str d11, [x28], #0x8\n" + "tbz x11, #1, 32f\n" + "st1 { v11.s }[2], [x28], #0x4\n" + "tbz x11, #0, 47f\n" + "st1 { v11.h }[6], [x28]\n" + "b 47f\n" + "32:" // Height 1: Partial direct writeback: partial_1_28 + "tbz x11, #0, 47f\n" + "st1 { v11.h }[4], [x28]\n" + "b 47f\n" + "33:" // Height 1: Partial direct writeback: partial_2_24 + "tbz x11, #1, 34f\n" + "str s11, [x28], #0x4\n" + "tbz x11, #0, 47f\n" + "st1 { v11.h }[2], [x28]\n" + "b 47f\n" + "34:" // Height 1: Partial direct writeback: partial_1_24 + "tbz x11, #0, 47f\n" + "str h11, [x28, #0x0]\n" + "b 47f\n" + "35:" // Height 1: Partial direct writeback: partial_4_16 + "tbz x11, #2, 37f\n" + "str d10, [x28], #0x8\n" + "tbz x11, #1, 36f\n" + "st1 { v10.s }[2], [x28], #0x4\n" + "tbz x11, #0, 47f\n" + "st1 { v10.h }[6], [x28]\n" + "b 47f\n" + "36:" // Height 1: Partial direct writeback: partial_1_20 + "tbz x11, #0, 47f\n" + "st1 { v10.h }[4], [x28]\n" + "b 47f\n" + "37:" // Height 1: Partial direct writeback: partial_2_16 + "tbz x11, #1, 38f\n" + "str s10, [x28], #0x4\n" + "tbz x11, #0, 47f\n" + "st1 { v10.h }[2], [x28]\n" + "b 47f\n" + "38:" // Height 1: Partial direct writeback: partial_1_16 + "tbz x11, 
#0, 47f\n" + "str h10, [x28, #0x0]\n" + "b 47f\n" + "39:" // Height 1: Partial direct writeback: partial_8_0 + "tbz x11, #3, 43f\n" + "st1 { v8.8h }, [x28], #0x10\n" + "tbz x11, #2, 41f\n" + "str d9, [x28], #0x8\n" + "tbz x11, #1, 40f\n" + "st1 { v9.s }[2], [x28], #0x4\n" + "tbz x11, #0, 47f\n" + "st1 { v9.h }[6], [x28]\n" + "b 47f\n" + "40:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x11, #0, 47f\n" + "st1 { v9.h }[4], [x28]\n" + "b 47f\n" + "41:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x11, #1, 42f\n" + "str s9, [x28], #0x4\n" + "tbz x11, #0, 47f\n" + "st1 { v9.h }[2], [x28]\n" + "b 47f\n" + "42:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x11, #0, 47f\n" + "str h9, [x28, #0x0]\n" + "b 47f\n" + "43:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x11, #2, 45f\n" + "str d8, [x28], #0x8\n" + "tbz x11, #1, 44f\n" + "st1 { v8.s }[2], [x28], #0x4\n" + "tbz x11, #0, 47f\n" + "st1 { v8.h }[6], [x28]\n" + "b 47f\n" + "44:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x11, #0, 47f\n" + "st1 { v8.h }[4], [x28]\n" + "b 47f\n" + "45:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x11, #1, 46f\n" + "str s8, [x28], #0x4\n" + "tbz x11, #0, 47f\n" + "st1 { v8.h }[2], [x28]\n" + "b 47f\n" + "46:" // Height 1: Partial direct writeback: partial_1_0 + "str h8, [x28, #0x0]\n" + "47:" // Height 1: Partial direct writeback: Done + "b 49f\n" + "48:" // Height 1: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "49:" // Height 1: Writeback done + "subs x11, x11, #0x20\n" + "bgt 2b\n" + "b 296f\n" + "50:" // Height 2 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "51:" // Height 2: Column loop + "cbz x9, 52f\n" + "ldr q8, [x9, #0x0]\n" "mov v12.16b, v8.16b\n" - "ldr q9, [x14, #0x10]\n" - "ldr q10, [x14, 
#0x20]\n" + "ldr q9, [x9, #0x10]\n" + "ldr q10, [x9, #0x20]\n" "mov v13.16b, v9.16b\n" - "ldr q11, [x14, #0x30]\n" + "ldr q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" "mov v14.16b, v10.16b\n" - "add x14, x14, #0x40\n" "mov v15.16b, v11.16b\n" - "b 73f\n" - "54:" // Height 2: no bias - "tbz %x[flags], #0, 72f\n" - "cmp x16, #0x20\n" - "bge 71f\n" - "tbz x16, #4, 62f\n" - "ld1 { v8.8h }, [x13], #0x10\n" - "ld1 { v12.8h }, [x9], #0x10\n" - "ld1 { v9.8h }, [x13], #0x10\n" - "ld1 { v13.8h }, [x9], #0x10\n" - "tbz x16, #3, 58f\n" - "ld1 { v10.8h }, [x13], #0x10\n" - "ld1 { v14.8h }, [x9], #0x10\n" - "tbz x16, #2, 56f\n" - "ldr d11, [x13], #0x8\n" - "ldr d15, [x9], #0x8\n" - "tbz x16, #1, 55f\n" + "b 71f\n" + "52:" // Height 2: no bias + "tbz %x[flags], #0, 70f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x20\n" + "add x24, x28, x19, LSL #1\n" + "bge 69f\n" + "tbz x11, #4, 60f\n" + "ld1 { v8.8h }, [x28], #0x10\n" + "ld1 { v12.8h }, [x24], #0x10\n" + "ld1 { v9.8h }, [x28], #0x10\n" + "ld1 { v13.8h }, [x24], #0x10\n" + "tbz x11, #3, 56f\n" + "ld1 { v10.8h }, [x28], #0x10\n" + "ld1 { v14.8h }, [x24], #0x10\n" + "tbz x11, #2, 54f\n" + "ldr d11, [x28], #0x8\n" + "ldr d15, [x24], #0x8\n" + "tbz x11, #1, 53f\n" "mov x19, #0x3c\n" - "ld1 { v11.s }[2], [x13], #0x4\n" - "ld1 { v15.s }[2], [x9], #0x4\n" - "tbz x16, #0, 70f\n" - "ld1 { v11.h }[6], [x13]\n" - "ld1 { v15.h }[6], [x9]\n" - "b 70f\n" - "55:" // Height 2: Partial accumulate: partial_1_28 + "ld1 { v11.s }[2], [x28], #0x4\n" + "ld1 { v15.s }[2], [x24], #0x4\n" + "tbz x11, #0, 68f\n" + "ld1 { v11.h }[6], [x28]\n" + "ld1 { v15.h }[6], [x24]\n" + "b 68f\n" + "53:" // Height 2: Partial accumulate: partial_1_28 "mov x19, #0x38\n" - "tbz x16, #0, 70f\n" - "ld1 { v11.h }[4], [x13]\n" - "ld1 { v15.h }[4], [x9]\n" - "b 70f\n" - "56:" // Height 2: Partial accumulate: partial_2_24 - "tbz x16, #1, 57f\n" - "ldr s11, [x13], #0x4\n" - "ldr s15, [x9], #0x4\n" + "tbz x11, #0, 68f\n" + "ld1 { v11.h }[4], [x28]\n" 
+ "ld1 { v15.h }[4], [x24]\n" + "b 68f\n" + "54:" // Height 2: Partial accumulate: partial_2_24 + "tbz x11, #1, 55f\n" + "ldr s11, [x28], #0x4\n" + "ldr s15, [x24], #0x4\n" "mov x19, #0x34\n" - "tbz x16, #0, 70f\n" - "ld1 { v11.h }[2], [x13]\n" - "ld1 { v15.h }[2], [x9]\n" - "b 70f\n" - "57:" // Height 2: Partial accumulate: partial_1_24 + "tbz x11, #0, 68f\n" + "ld1 { v11.h }[2], [x28]\n" + "ld1 { v15.h }[2], [x24]\n" + "b 68f\n" + "55:" // Height 2: Partial accumulate: partial_1_24 "mov x19, #0x30\n" - "tbz x16, #0, 70f\n" - "ldr h11, [x13, #0x0]\n" - "ldr h15, [x9, #0x0]\n" - "b 70f\n" - "58:" // Height 2: Partial accumulate: partial_4_16 - "tbz x16, #2, 60f\n" - "ldr d10, [x13], #0x8\n" - "ldr d14, [x9], #0x8\n" - "tbz x16, #1, 59f\n" + "tbz x11, #0, 68f\n" + "ldr h11, [x28, #0x0]\n" + "ldr h15, [x24, #0x0]\n" + "b 68f\n" + "56:" // Height 2: Partial accumulate: partial_4_16 + "tbz x11, #2, 58f\n" + "ldr d10, [x28], #0x8\n" + "ldr d14, [x24], #0x8\n" + "tbz x11, #1, 57f\n" "mov x19, #0x2c\n" - "ld1 { v10.s }[2], [x13], #0x4\n" - "ld1 { v14.s }[2], [x9], #0x4\n" - "tbz x16, #0, 70f\n" - "ld1 { v10.h }[6], [x13]\n" - "ld1 { v14.h }[6], [x9]\n" - "b 70f\n" - "59:" // Height 2: Partial accumulate: partial_1_20 + "ld1 { v10.s }[2], [x28], #0x4\n" + "ld1 { v14.s }[2], [x24], #0x4\n" + "tbz x11, #0, 68f\n" + "ld1 { v10.h }[6], [x28]\n" + "ld1 { v14.h }[6], [x24]\n" + "b 68f\n" + "57:" // Height 2: Partial accumulate: partial_1_20 "mov x19, #0x28\n" - "tbz x16, #0, 70f\n" - "ld1 { v10.h }[4], [x13]\n" - "ld1 { v14.h }[4], [x9]\n" - "b 70f\n" - "60:" // Height 2: Partial accumulate: partial_2_16 - "tbz x16, #1, 61f\n" - "ldr s10, [x13], #0x4\n" - "ldr s14, [x9], #0x4\n" + "tbz x11, #0, 68f\n" + "ld1 { v10.h }[4], [x28]\n" + "ld1 { v14.h }[4], [x24]\n" + "b 68f\n" + "58:" // Height 2: Partial accumulate: partial_2_16 + "tbz x11, #1, 59f\n" + "ldr s10, [x28], #0x4\n" + "ldr s14, [x24], #0x4\n" "mov x19, #0x24\n" - "tbz x16, #0, 70f\n" - "ld1 { v10.h }[2], [x13]\n" - "ld1 
{ v14.h }[2], [x9]\n" - "b 70f\n" - "61:" // Height 2: Partial accumulate: partial_1_16 + "tbz x11, #0, 68f\n" + "ld1 { v10.h }[2], [x28]\n" + "ld1 { v14.h }[2], [x24]\n" + "b 68f\n" + "59:" // Height 2: Partial accumulate: partial_1_16 "mov x19, #0x20\n" - "tbz x16, #0, 70f\n" - "ldr h10, [x13, #0x0]\n" - "ldr h14, [x9, #0x0]\n" - "b 70f\n" - "62:" // Height 2: Partial accumulate: partial_8_0 - "tbz x16, #3, 66f\n" - "ld1 { v8.8h }, [x13], #0x10\n" - "ld1 { v12.8h }, [x9], #0x10\n" - "tbz x16, #2, 64f\n" - "ldr d9, [x13], #0x8\n" - "ldr d13, [x9], #0x8\n" - "tbz x16, #1, 63f\n" + "tbz x11, #0, 68f\n" + "ldr h10, [x28, #0x0]\n" + "ldr h14, [x24, #0x0]\n" + "b 68f\n" + "60:" // Height 2: Partial accumulate: partial_8_0 + "tbz x11, #3, 64f\n" + "ld1 { v8.8h }, [x28], #0x10\n" + "ld1 { v12.8h }, [x24], #0x10\n" + "tbz x11, #2, 62f\n" + "ldr d9, [x28], #0x8\n" + "ldr d13, [x24], #0x8\n" + "tbz x11, #1, 61f\n" "mov x19, #0x1c\n" - "ld1 { v9.s }[2], [x13], #0x4\n" - "ld1 { v13.s }[2], [x9], #0x4\n" - "tbz x16, #0, 70f\n" - "ld1 { v9.h }[6], [x13]\n" - "ld1 { v13.h }[6], [x9]\n" - "b 70f\n" - "63:" // Height 2: Partial accumulate: partial_1_12 + "ld1 { v9.s }[2], [x28], #0x4\n" + "ld1 { v13.s }[2], [x24], #0x4\n" + "tbz x11, #0, 68f\n" + "ld1 { v9.h }[6], [x28]\n" + "ld1 { v13.h }[6], [x24]\n" + "b 68f\n" + "61:" // Height 2: Partial accumulate: partial_1_12 "mov x19, #0x18\n" - "tbz x16, #0, 70f\n" - "ld1 { v9.h }[4], [x13]\n" - "ld1 { v13.h }[4], [x9]\n" - "b 70f\n" - "64:" // Height 2: Partial accumulate: partial_2_8 - "tbz x16, #1, 65f\n" - "ldr s9, [x13], #0x4\n" - "ldr s13, [x9], #0x4\n" + "tbz x11, #0, 68f\n" + "ld1 { v9.h }[4], [x28]\n" + "ld1 { v13.h }[4], [x24]\n" + "b 68f\n" + "62:" // Height 2: Partial accumulate: partial_2_8 + "tbz x11, #1, 63f\n" + "ldr s9, [x28], #0x4\n" + "ldr s13, [x24], #0x4\n" "mov x19, #0x14\n" - "tbz x16, #0, 70f\n" - "ld1 { v9.h }[2], [x13]\n" - "ld1 { v13.h }[2], [x9]\n" - "b 70f\n" - "65:" // Height 2: Partial accumulate: 
partial_1_8 + "tbz x11, #0, 68f\n" + "ld1 { v9.h }[2], [x28]\n" + "ld1 { v13.h }[2], [x24]\n" + "b 68f\n" + "63:" // Height 2: Partial accumulate: partial_1_8 "mov x19, #0x10\n" - "tbz x16, #0, 70f\n" - "ldr h9, [x13, #0x0]\n" - "ldr h13, [x9, #0x0]\n" - "b 70f\n" - "66:" // Height 2: Partial accumulate: partial_4_0 - "tbz x16, #2, 68f\n" - "ldr d8, [x13], #0x8\n" - "ldr d12, [x9], #0x8\n" - "tbz x16, #1, 67f\n" + "tbz x11, #0, 68f\n" + "ldr h9, [x28, #0x0]\n" + "ldr h13, [x24, #0x0]\n" + "b 68f\n" + "64:" // Height 2: Partial accumulate: partial_4_0 + "tbz x11, #2, 66f\n" + "ldr d8, [x28], #0x8\n" + "ldr d12, [x24], #0x8\n" + "tbz x11, #1, 65f\n" "mov x19, #0xc\n" - "ld1 { v8.s }[2], [x13], #0x4\n" - "ld1 { v12.s }[2], [x9], #0x4\n" - "tbz x16, #0, 70f\n" - "ld1 { v8.h }[6], [x13]\n" - "ld1 { v12.h }[6], [x9]\n" - "b 70f\n" - "67:" // Height 2: Partial accumulate: partial_1_4 + "ld1 { v8.s }[2], [x28], #0x4\n" + "ld1 { v12.s }[2], [x24], #0x4\n" + "tbz x11, #0, 68f\n" + "ld1 { v8.h }[6], [x28]\n" + "ld1 { v12.h }[6], [x24]\n" + "b 68f\n" + "65:" // Height 2: Partial accumulate: partial_1_4 "mov x19, #0x8\n" - "tbz x16, #0, 70f\n" - "ld1 { v8.h }[4], [x13]\n" - "ld1 { v12.h }[4], [x9]\n" - "b 70f\n" - "68:" // Height 2: Partial accumulate: partial_2_0 - "tbz x16, #1, 69f\n" - "ldr s8, [x13], #0x4\n" - "ldr s12, [x9], #0x4\n" + "tbz x11, #0, 68f\n" + "ld1 { v8.h }[4], [x28]\n" + "ld1 { v12.h }[4], [x24]\n" + "b 68f\n" + "66:" // Height 2: Partial accumulate: partial_2_0 + "tbz x11, #1, 67f\n" + "ldr s8, [x28], #0x4\n" + "ldr s12, [x24], #0x4\n" "mov x19, #0x4\n" - "tbz x16, #0, 70f\n" - "ld1 { v8.h }[2], [x13]\n" - "ld1 { v12.h }[2], [x9]\n" - "b 70f\n" - "69:" // Height 2: Partial accumulate: partial_1_0 + "tbz x11, #0, 68f\n" + "ld1 { v8.h }[2], [x28]\n" + "ld1 { v12.h }[2], [x24]\n" + "b 68f\n" + "67:" // Height 2: Partial accumulate: partial_1_0 + "ldr h8, [x28, #0x0]\n" "mov x19, #0x0\n" - "ldr h8, [x13, #0x0]\n" - "ldr h12, [x9, #0x0]\n" - "70:" // Height 2: 
Partial accumulate: Done - "sub x13, x13, x19\n" - "sub x9, x9, x19\n" - "b 73f\n" - "71:" // Height 2: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "ldr q12, [x9, #0x0]\n" - "ldr q13, [x9, #0x10]\n" - "ldr q14, [x9, #0x20]\n" - "ldr q15, [x9, #0x30]\n" - "b 73f\n" - "72:" // Height 2: no accumulate + "ldr h12, [x24, #0x0]\n" + "68:" // Height 2: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 71f\n" + "69:" // Height 2: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "ldr q12, [x24, #0x0]\n" + "ldr q13, [x24, #0x10]\n" + "ldr q14, [x24, #0x20]\n" + "ldr q15, [x24, #0x30]\n" + "b 71f\n" + "70:" // Height 2: no accumulate "movi v8.16b, #0x0\n" "movi v9.16b, #0x0\n" "movi v10.16b, #0x0\n" @@ -744,270 +731,272 @@ void a64_hybrid_fp16_mla_6x32 ( "movi v13.16b, #0x0\n" "movi v14.16b, #0x0\n" "movi v15.16b, #0x0\n" - "73:" // Height 2: setup done - "mov x12, #0x0\n" - "74:" // Height 2: String loop + "71:" // Height 2: setup done + "mov x27, #0x0\n" + "72:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 75f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 73f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "cbnz x12, 76f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x27, 74f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #1\n" - "add x28, x28, x19, LSL #1\n" - "b 76f\n" - "75:" // Height 2: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #1\n" - "76:" // Height 2: input setup done - "cmp x11, #0x8\n" - "blt 79f\n" - "cmp x11, #0x10\n" - 
"blt 78f\n" - "77:" // Height 2: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "add x25, x25, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "b 74f\n" + "73:" // Height 2: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "74:" // Height 2: input setup done + "cmp x26, #0x8\n" + "blt 77f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x10\n" + "ldr q6, [x10, #0x0]\n" + "blt 76f\n" + "75:" // Height 2: Multiply loop: Main loop head "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" + "add x25, x25, #0x10\n" "fmla v12.8h, v6.8h, v1.h[0]\n" - "ldr q6, [x15, #0x20]\n" - "add x10, x10, #0x10\n" + "ldr q6, [x10, #0x20]\n" + "add x24, x24, #0x10\n" "fmla v9.8h, v7.8h, v0.h[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "sub x26, x26, #0x8\n" "fmla v13.8h, v7.8h, v1.h[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" + "cmp x26, #0x10\n" "fmla v10.8h, v6.8h, v0.h[0]\n" - "sub x11, x11, #0x8\n" + "prfm pldl1keep, [x24, #0x80]\n" "fmla v14.8h, v6.8h, v1.h[0]\n" - "ldr q6, [x15, #0x40]\n" - "cmp x11, #0x10\n" + "ldr q6, [x10, #0x40]\n" "fmla v11.8h, v7.8h, v0.h[0]\n" "fmla v15.8h, v7.8h, v1.h[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" "fmla v8.8h, v6.8h, v0.h[1]\n" "fmla v12.8h, v6.8h, v1.h[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" "fmla v9.8h, v7.8h, v0.h[1]\n" "fmla v13.8h, v7.8h, v1.h[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" "fmla v10.8h, v6.8h, v0.h[1]\n" "fmla v14.8h, v6.8h, v1.h[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" "fmla v11.8h, v7.8h, v0.h[1]\n" "fmla v15.8h, v7.8h, v1.h[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" "fmla v8.8h, v6.8h, v0.h[2]\n" "fmla v12.8h, v6.8h, v1.h[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, 
#0xa0]\n" "fmla v9.8h, v7.8h, v0.h[2]\n" "fmla v13.8h, v7.8h, v1.h[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" "fmla v10.8h, v6.8h, v0.h[2]\n" "fmla v14.8h, v6.8h, v1.h[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" "fmla v11.8h, v7.8h, v0.h[2]\n" "fmla v15.8h, v7.8h, v1.h[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" "fmla v8.8h, v6.8h, v0.h[3]\n" "fmla v12.8h, v6.8h, v1.h[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" "fmla v9.8h, v7.8h, v0.h[3]\n" "fmla v13.8h, v7.8h, v1.h[3]\n" - "ldr q7, [x15, #0xf0]\n" + "ldr q7, [x10, #0xf0]\n" "fmla v10.8h, v6.8h, v0.h[3]\n" "fmla v14.8h, v6.8h, v1.h[3]\n" - "ldr q6, [x15, #0x100]\n" + "ldr q6, [x10, #0x100]\n" "fmla v11.8h, v7.8h, v0.h[3]\n" "fmla v15.8h, v7.8h, v1.h[3]\n" - "ldr q7, [x15, #0x110]\n" + "ldr q7, [x10, #0x110]\n" "fmla v8.8h, v6.8h, v0.h[4]\n" "fmla v12.8h, v6.8h, v1.h[4]\n" - "ldr q6, [x15, #0x120]\n" + "ldr q6, [x10, #0x120]\n" "fmla v9.8h, v7.8h, v0.h[4]\n" "fmla v13.8h, v7.8h, v1.h[4]\n" - "ldr q7, [x15, #0x130]\n" + "ldr q7, [x10, #0x130]\n" "fmla v10.8h, v6.8h, v0.h[4]\n" "fmla v14.8h, v6.8h, v1.h[4]\n" - "ldr q6, [x15, #0x140]\n" + "ldr q6, [x10, #0x140]\n" "fmla v11.8h, v7.8h, v0.h[4]\n" "fmla v15.8h, v7.8h, v1.h[4]\n" - "ldr q7, [x15, #0x150]\n" + "ldr q7, [x10, #0x150]\n" "fmla v8.8h, v6.8h, v0.h[5]\n" "fmla v12.8h, v6.8h, v1.h[5]\n" - "ldr q6, [x15, #0x160]\n" + "ldr q6, [x10, #0x160]\n" "fmla v9.8h, v7.8h, v0.h[5]\n" "fmla v13.8h, v7.8h, v1.h[5]\n" - "ldr q7, [x15, #0x170]\n" + "ldr q7, [x10, #0x170]\n" "fmla v10.8h, v6.8h, v0.h[5]\n" "fmla v14.8h, v6.8h, v1.h[5]\n" - "ldr q6, [x15, #0x180]\n" + "ldr q6, [x10, #0x180]\n" "fmla v11.8h, v7.8h, v0.h[5]\n" "fmla v15.8h, v7.8h, v1.h[5]\n" - "ldr q7, [x15, #0x190]\n" + "ldr q7, [x10, #0x190]\n" "fmla v8.8h, v6.8h, v0.h[6]\n" "fmla v12.8h, v6.8h, v1.h[6]\n" - "ldr q6, [x15, #0x1a0]\n" + "ldr q6, [x10, #0x1a0]\n" "fmla v9.8h, v7.8h, v0.h[6]\n" "fmla v13.8h, v7.8h, v1.h[6]\n" - "ldr q7, [x15, 
#0x1b0]\n" + "ldr q7, [x10, #0x1b0]\n" "fmla v10.8h, v6.8h, v0.h[6]\n" "fmla v14.8h, v6.8h, v1.h[6]\n" - "ldr q6, [x15, #0x1c0]\n" + "ldr q6, [x10, #0x1c0]\n" "fmla v11.8h, v7.8h, v0.h[6]\n" "fmla v15.8h, v7.8h, v1.h[6]\n" - "ldr q7, [x15, #0x1d0]\n" + "ldr q7, [x10, #0x1d0]\n" "fmla v8.8h, v6.8h, v0.h[7]\n" "fmla v12.8h, v6.8h, v1.h[7]\n" - "ldr q6, [x15, #0x1e0]\n" + "ldr q6, [x10, #0x1e0]\n" "fmla v9.8h, v7.8h, v0.h[7]\n" "fmla v13.8h, v7.8h, v1.h[7]\n" - "ldr q7, [x15, #0x1f0]\n" - "add x15, x15, #0x200\n" + "ldr q7, [x10, #0x1f0]\n" + "add x10, x10, #0x200\n" "fmla v10.8h, v6.8h, v0.h[7]\n" "fmla v14.8h, v6.8h, v1.h[7]\n" + "ldr q6, [x10, #0x0]\n" "fmla v11.8h, v7.8h, v0.h[7]\n" + "ldr q0, [x25, #0x0]\n" "fmla v15.8h, v7.8h, v1.h[7]\n" - "bge 77b\n" - "78:" // Height 2: Multiply loop: Single iteration only - "sub x11, x11, #0x8\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "bge 75b\n" + "76:" // Height 2: Multiply loop: Single iteration only "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" + "sub x26, x26, #0x8\n" "fmla v12.8h, v6.8h, v1.h[0]\n" - "ldr q6, [x15, #0x20]\n" - "add x10, x10, #0x10\n" + "ldr q6, [x10, #0x20]\n" + "add x25, x25, #0x10\n" "fmla v9.8h, v7.8h, v0.h[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" "fmla v13.8h, v7.8h, v1.h[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" "fmla v10.8h, v6.8h, v0.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" "fmla v14.8h, v6.8h, v1.h[0]\n" - "ldr q6, [x15, #0x40]\n" + "ldr q6, [x10, #0x40]\n" "fmla v11.8h, v7.8h, v0.h[0]\n" "fmla v15.8h, v7.8h, v1.h[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" "fmla v8.8h, v6.8h, v0.h[1]\n" "fmla v12.8h, v6.8h, v1.h[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" "fmla v9.8h, v7.8h, v0.h[1]\n" "fmla v13.8h, v7.8h, v1.h[1]\n" - 
"ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" "fmla v10.8h, v6.8h, v0.h[1]\n" "fmla v14.8h, v6.8h, v1.h[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" "fmla v11.8h, v7.8h, v0.h[1]\n" "fmla v15.8h, v7.8h, v1.h[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" "fmla v8.8h, v6.8h, v0.h[2]\n" "fmla v12.8h, v6.8h, v1.h[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" "fmla v9.8h, v7.8h, v0.h[2]\n" "fmla v13.8h, v7.8h, v1.h[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" "fmla v10.8h, v6.8h, v0.h[2]\n" "fmla v14.8h, v6.8h, v1.h[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" "fmla v11.8h, v7.8h, v0.h[2]\n" "fmla v15.8h, v7.8h, v1.h[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" "fmla v8.8h, v6.8h, v0.h[3]\n" "fmla v12.8h, v6.8h, v1.h[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" "fmla v9.8h, v7.8h, v0.h[3]\n" "fmla v13.8h, v7.8h, v1.h[3]\n" - "ldr q7, [x15, #0xf0]\n" + "ldr q7, [x10, #0xf0]\n" "fmla v10.8h, v6.8h, v0.h[3]\n" "fmla v14.8h, v6.8h, v1.h[3]\n" - "ldr q6, [x15, #0x100]\n" + "ldr q6, [x10, #0x100]\n" "fmla v11.8h, v7.8h, v0.h[3]\n" "fmla v15.8h, v7.8h, v1.h[3]\n" - "ldr q7, [x15, #0x110]\n" + "ldr q7, [x10, #0x110]\n" "fmla v8.8h, v6.8h, v0.h[4]\n" "fmla v12.8h, v6.8h, v1.h[4]\n" - "ldr q6, [x15, #0x120]\n" + "ldr q6, [x10, #0x120]\n" "fmla v9.8h, v7.8h, v0.h[4]\n" "fmla v13.8h, v7.8h, v1.h[4]\n" - "ldr q7, [x15, #0x130]\n" + "ldr q7, [x10, #0x130]\n" "fmla v10.8h, v6.8h, v0.h[4]\n" "fmla v14.8h, v6.8h, v1.h[4]\n" - "ldr q6, [x15, #0x140]\n" + "ldr q6, [x10, #0x140]\n" "fmla v11.8h, v7.8h, v0.h[4]\n" "fmla v15.8h, v7.8h, v1.h[4]\n" - "ldr q7, [x15, #0x150]\n" + "ldr q7, [x10, #0x150]\n" "fmla v8.8h, v6.8h, v0.h[5]\n" "fmla v12.8h, v6.8h, v1.h[5]\n" - "ldr q6, [x15, #0x160]\n" + "ldr q6, [x10, #0x160]\n" "fmla v9.8h, v7.8h, v0.h[5]\n" "fmla v13.8h, v7.8h, v1.h[5]\n" - "ldr q7, [x15, #0x170]\n" + "ldr q7, [x10, #0x170]\n" "fmla v10.8h, v6.8h, v0.h[5]\n" "fmla v14.8h, 
v6.8h, v1.h[5]\n" - "ldr q6, [x15, #0x180]\n" + "ldr q6, [x10, #0x180]\n" "fmla v11.8h, v7.8h, v0.h[5]\n" "fmla v15.8h, v7.8h, v1.h[5]\n" - "ldr q7, [x15, #0x190]\n" + "ldr q7, [x10, #0x190]\n" "fmla v8.8h, v6.8h, v0.h[6]\n" "fmla v12.8h, v6.8h, v1.h[6]\n" - "ldr q6, [x15, #0x1a0]\n" + "ldr q6, [x10, #0x1a0]\n" "fmla v9.8h, v7.8h, v0.h[6]\n" "fmla v13.8h, v7.8h, v1.h[6]\n" - "ldr q7, [x15, #0x1b0]\n" + "ldr q7, [x10, #0x1b0]\n" "fmla v10.8h, v6.8h, v0.h[6]\n" "fmla v14.8h, v6.8h, v1.h[6]\n" - "ldr q6, [x15, #0x1c0]\n" + "ldr q6, [x10, #0x1c0]\n" "fmla v11.8h, v7.8h, v0.h[6]\n" "fmla v15.8h, v7.8h, v1.h[6]\n" - "ldr q7, [x15, #0x1d0]\n" + "ldr q7, [x10, #0x1d0]\n" "fmla v8.8h, v6.8h, v0.h[7]\n" "fmla v12.8h, v6.8h, v1.h[7]\n" - "ldr q6, [x15, #0x1e0]\n" + "ldr q6, [x10, #0x1e0]\n" "fmla v9.8h, v7.8h, v0.h[7]\n" "fmla v13.8h, v7.8h, v1.h[7]\n" - "ldr q7, [x15, #0x1f0]\n" - "add x15, x15, #0x200\n" + "ldr q7, [x10, #0x1f0]\n" + "add x10, x10, #0x200\n" "fmla v10.8h, v6.8h, v0.h[7]\n" "fmla v14.8h, v6.8h, v1.h[7]\n" "fmla v11.8h, v7.8h, v0.h[7]\n" "fmla v15.8h, v7.8h, v1.h[7]\n" - "79:" // Height 2: Multiply loop: Main loop skip - "cbz x11, 81f\n" - "80:" // Height 2: Multiply loop: Odd block loop - "ldr h0, [x10], #0x2\n" - "ldr h1, [x28], #0x2\n" - "ldr q6, [x15, #0x0]\n" + "77:" // Height 2: Multiply loop: Main loop skip + "cbz x26, 79f\n" + "78:" // Height 2: Multiply loop: Odd block loop + "ldr h0, [x25], #0x2\n" + "sub x26, x26, #0x1\n" + "ldr h1, [x24], #0x2\n" + "ldr q6, [x10, #0x0]\n" "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" "fmla v12.8h, v6.8h, v1.h[0]\n" - "ldr q6, [x15, #0x20]\n" - "sub x11, x11, #0x1\n" + "ldr q6, [x10, #0x20]\n" "fmla v9.8h, v7.8h, v0.h[0]\n" "fmla v13.8h, v7.8h, v1.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" "fmla v10.8h, v6.8h, v0.h[0]\n" - "add x15, x15, #0x40\n" "fmla v14.8h, v6.8h, v1.h[0]\n" "fmla v11.8h, v7.8h, v0.h[0]\n" "fmla v15.8h, v7.8h, 
v1.h[0]\n" - "cbnz x11, 80b\n" - "81:" // Height 2: Multiply loop: No odd multiplies + "cbnz x26, 78b\n" + "79:" // Height 2: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 74b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "tbz %x[flags], #1, 82f\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 72b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x24, x28, x19, LSL #1\n" + "prfm pstl1keep, [x24, #0x0]\n" + "tbz %x[flags], #1, 80f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1r { v1.8h }, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -1028,379 +1017,367 @@ void a64_hybrid_fp16_mla_6x32 ( "fmax v14.8h, v14.8h, v1.8h\n" "fmin v15.8h, v15.8h, v0.8h\n" "fmax v15.8h, v15.8h, v1.8h\n" - "82:" // Height 2: No activation - "cmp x16, #0x20\n" - "bge 99f\n" - "tbz x16, #4, 90f\n" - "st1 { v8.8h }, [x13], #0x10\n" - "st1 { v9.8h }, [x13], #0x10\n" - "st1 { v12.8h }, [x9], #0x10\n" - "st1 { v13.8h }, [x9], #0x10\n" - "tbz x16, #3, 86f\n" - "st1 { v10.8h }, [x13], #0x10\n" - "st1 { v14.8h }, [x9], #0x10\n" - "tbz x16, #2, 84f\n" - "str d11, [x13], #0x8\n" - "str d15, [x9], #0x8\n" - "tbz x16, #1, 83f\n" - "st1 { v11.s }[2], [x13], #0x4\n" - "st1 { v15.s }[2], [x9], #0x4\n" - "tbz x16, #0, 98f\n" - "st1 { v11.h }[6], [x13]\n" - "st1 { v15.h }[6], [x9]\n" - "b 98f\n" - "83:" // Height 2: Partial direct writeback: partial_1_28 - "tbz x16, #0, 98f\n" - "st1 { v11.h }[4], [x13]\n" - "st1 { v15.h }[4], [x9]\n" - "b 98f\n" - "84:" // Height 2: Partial direct writeback: partial_2_24 - "tbz x16, #1, 85f\n" - "str s11, [x13], #0x4\n" - "str s15, [x9], #0x4\n" - "tbz x16, #0, 98f\n" - "st1 { v11.h }[2], [x13]\n" - "st1 { v15.h }[2], [x9]\n" - "b 98f\n" - "85:" // Height 2: Partial direct writeback: partial_1_24 - "tbz x16, #0, 98f\n" - "str h11, [x13, #0x0]\n" - "str h15, [x9, #0x0]\n" - "b 98f\n" - "86:" // 
Height 2: Partial direct writeback: partial_4_16 - "tbz x16, #2, 88f\n" - "str d10, [x13], #0x8\n" - "str d14, [x9], #0x8\n" - "tbz x16, #1, 87f\n" - "st1 { v10.s }[2], [x13], #0x4\n" - "st1 { v14.s }[2], [x9], #0x4\n" - "tbz x16, #0, 98f\n" - "st1 { v10.h }[6], [x13]\n" - "st1 { v14.h }[6], [x9]\n" - "b 98f\n" - "87:" // Height 2: Partial direct writeback: partial_1_20 - "tbz x16, #0, 98f\n" - "st1 { v10.h }[4], [x13]\n" - "st1 { v14.h }[4], [x9]\n" - "b 98f\n" - "88:" // Height 2: Partial direct writeback: partial_2_16 - "tbz x16, #1, 89f\n" - "str s10, [x13], #0x4\n" - "str s14, [x9], #0x4\n" - "tbz x16, #0, 98f\n" - "st1 { v10.h }[2], [x13]\n" - "st1 { v14.h }[2], [x9]\n" - "b 98f\n" - "89:" // Height 2: Partial direct writeback: partial_1_16 - "tbz x16, #0, 98f\n" - "str h10, [x13, #0x0]\n" - "str h14, [x9, #0x0]\n" - "b 98f\n" - "90:" // Height 2: Partial direct writeback: partial_8_0 - "tbz x16, #3, 94f\n" - "st1 { v8.8h }, [x13], #0x10\n" - "st1 { v12.8h }, [x9], #0x10\n" - "tbz x16, #2, 92f\n" - "str d9, [x13], #0x8\n" - "str d13, [x9], #0x8\n" - "tbz x16, #1, 91f\n" - "st1 { v9.s }[2], [x13], #0x4\n" - "st1 { v13.s }[2], [x9], #0x4\n" - "tbz x16, #0, 98f\n" - "st1 { v9.h }[6], [x13]\n" - "st1 { v13.h }[6], [x9]\n" + "80:" // Height 2: No activation + "cmp x11, #0x20\n" + "bge 97f\n" + "tbz x11, #4, 88f\n" + "st1 { v8.8h }, [x28], #0x10\n" + "st1 { v9.8h }, [x28], #0x10\n" + "st1 { v12.8h }, [x24], #0x10\n" + "st1 { v13.8h }, [x24], #0x10\n" + "tbz x11, #3, 84f\n" + "st1 { v10.8h }, [x28], #0x10\n" + "st1 { v14.8h }, [x24], #0x10\n" + "tbz x11, #2, 82f\n" + "str d11, [x28], #0x8\n" + "str d15, [x24], #0x8\n" + "tbz x11, #1, 81f\n" + "st1 { v11.s }[2], [x28], #0x4\n" + "st1 { v15.s }[2], [x24], #0x4\n" + "tbz x11, #0, 96f\n" + "st1 { v11.h }[6], [x28]\n" + "st1 { v15.h }[6], [x24]\n" + "b 96f\n" + "81:" // Height 2: Partial direct writeback: partial_1_28 + "tbz x11, #0, 96f\n" + "st1 { v11.h }[4], [x28]\n" + "st1 { v15.h }[4], [x24]\n" + "b 96f\n" + "82:" 
// Height 2: Partial direct writeback: partial_2_24 + "tbz x11, #1, 83f\n" + "str s11, [x28], #0x4\n" + "str s15, [x24], #0x4\n" + "tbz x11, #0, 96f\n" + "st1 { v11.h }[2], [x28]\n" + "st1 { v15.h }[2], [x24]\n" + "b 96f\n" + "83:" // Height 2: Partial direct writeback: partial_1_24 + "tbz x11, #0, 96f\n" + "str h11, [x28, #0x0]\n" + "str h15, [x24, #0x0]\n" + "b 96f\n" + "84:" // Height 2: Partial direct writeback: partial_4_16 + "tbz x11, #2, 86f\n" + "str d10, [x28], #0x8\n" + "str d14, [x24], #0x8\n" + "tbz x11, #1, 85f\n" + "st1 { v10.s }[2], [x28], #0x4\n" + "st1 { v14.s }[2], [x24], #0x4\n" + "tbz x11, #0, 96f\n" + "st1 { v10.h }[6], [x28]\n" + "st1 { v14.h }[6], [x24]\n" + "b 96f\n" + "85:" // Height 2: Partial direct writeback: partial_1_20 + "tbz x11, #0, 96f\n" + "st1 { v10.h }[4], [x28]\n" + "st1 { v14.h }[4], [x24]\n" + "b 96f\n" + "86:" // Height 2: Partial direct writeback: partial_2_16 + "tbz x11, #1, 87f\n" + "str s10, [x28], #0x4\n" + "str s14, [x24], #0x4\n" + "tbz x11, #0, 96f\n" + "st1 { v10.h }[2], [x28]\n" + "st1 { v14.h }[2], [x24]\n" + "b 96f\n" + "87:" // Height 2: Partial direct writeback: partial_1_16 + "tbz x11, #0, 96f\n" + "str h10, [x28, #0x0]\n" + "str h14, [x24, #0x0]\n" + "b 96f\n" + "88:" // Height 2: Partial direct writeback: partial_8_0 + "tbz x11, #3, 92f\n" + "st1 { v8.8h }, [x28], #0x10\n" + "st1 { v12.8h }, [x24], #0x10\n" + "tbz x11, #2, 90f\n" + "str d9, [x28], #0x8\n" + "str d13, [x24], #0x8\n" + "tbz x11, #1, 89f\n" + "st1 { v9.s }[2], [x28], #0x4\n" + "st1 { v13.s }[2], [x24], #0x4\n" + "tbz x11, #0, 96f\n" + "st1 { v9.h }[6], [x28]\n" + "st1 { v13.h }[6], [x24]\n" + "b 96f\n" + "89:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x11, #0, 96f\n" + "st1 { v9.h }[4], [x28]\n" + "st1 { v13.h }[4], [x24]\n" + "b 96f\n" + "90:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x11, #1, 91f\n" + "str s9, [x28], #0x4\n" + "str s13, [x24], #0x4\n" + "tbz x11, #0, 96f\n" + "st1 { v9.h }[2], [x28]\n" + "st1 
{ v13.h }[2], [x24]\n" + "b 96f\n" + "91:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x11, #0, 96f\n" + "str h9, [x28, #0x0]\n" + "str h13, [x24, #0x0]\n" + "b 96f\n" + "92:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x11, #2, 94f\n" + "str d8, [x28], #0x8\n" + "str d12, [x24], #0x8\n" + "tbz x11, #1, 93f\n" + "st1 { v8.s }[2], [x28], #0x4\n" + "st1 { v12.s }[2], [x24], #0x4\n" + "tbz x11, #0, 96f\n" + "st1 { v8.h }[6], [x28]\n" + "st1 { v12.h }[6], [x24]\n" + "b 96f\n" + "93:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x11, #0, 96f\n" + "st1 { v8.h }[4], [x28]\n" + "st1 { v12.h }[4], [x24]\n" + "b 96f\n" + "94:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x11, #1, 95f\n" + "str s8, [x28], #0x4\n" + "str s12, [x24], #0x4\n" + "tbz x11, #0, 96f\n" + "st1 { v8.h }[2], [x28]\n" + "st1 { v12.h }[2], [x24]\n" + "b 96f\n" + "95:" // Height 2: Partial direct writeback: partial_1_0 + "str h8, [x28, #0x0]\n" + "str h12, [x24, #0x0]\n" + "96:" // Height 2: Partial direct writeback: Done "b 98f\n" - "91:" // Height 2: Partial direct writeback: partial_1_12 - "tbz x16, #0, 98f\n" - "st1 { v9.h }[4], [x13]\n" - "st1 { v13.h }[4], [x9]\n" - "b 98f\n" - "92:" // Height 2: Partial direct writeback: partial_2_8 - "tbz x16, #1, 93f\n" - "str s9, [x13], #0x4\n" - "str s13, [x9], #0x4\n" - "tbz x16, #0, 98f\n" - "st1 { v9.h }[2], [x13]\n" - "st1 { v13.h }[2], [x9]\n" - "b 98f\n" - "93:" // Height 2: Partial direct writeback: partial_1_8 - "tbz x16, #0, 98f\n" - "str h9, [x13, #0x0]\n" - "str h13, [x9, #0x0]\n" - "b 98f\n" - "94:" // Height 2: Partial direct writeback: partial_4_0 - "tbz x16, #2, 96f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "tbz x16, #1, 95f\n" - "st1 { v8.s }[2], [x13], #0x4\n" - "st1 { v12.s }[2], [x9], #0x4\n" - "tbz x16, #0, 98f\n" - "st1 { v8.h }[6], [x13]\n" - "st1 { v12.h }[6], [x9]\n" - "b 98f\n" - "95:" // Height 2: Partial direct writeback: partial_1_4 - "tbz x16, #0, 98f\n" - "st1 { 
v8.h }[4], [x13]\n" - "st1 { v12.h }[4], [x9]\n" - "b 98f\n" - "96:" // Height 2: Partial direct writeback: partial_2_0 - "tbz x16, #1, 97f\n" - "str s8, [x13], #0x4\n" - "str s12, [x9], #0x4\n" - "tbz x16, #0, 98f\n" - "st1 { v8.h }[2], [x13]\n" - "st1 { v12.h }[2], [x9]\n" - "b 98f\n" - "97:" // Height 2: Partial direct writeback: partial_1_0 - "str h8, [x13, #0x0]\n" - "str h12, [x9, #0x0]\n" - "98:" // Height 2: Partial direct writeback: Done - "b 100f\n" - "99:" // Height 2: Full writeback - "str q8, [x13, #0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "str q12, [x9, #0x0]\n" - "str q13, [x9, #0x10]\n" - "str q14, [x9, #0x20]\n" - "str q15, [x9, #0x30]\n" - "add x13, x13, #0x40\n" - "add x9, x9, #0x40\n" - "100:" // Height 2: Writeback done - "subs x16, x16, #0x20\n" - "bgt 53b\n" - "b 302f\n" - "101:" // Height 3 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 102f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #1\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #1\n" - "add x27, x27, x19, LSL #1\n" - "b 103f\n" - "102:" // Height 3: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #1\n" - "add x27, x9, x19, LSL #1\n" - "103:" // Height 3: Column loop - "cbz x14, 104f\n" - "ldr q8, [x14, #0x0]\n" + "97:" // Height 2: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q12, [x24, #0x0]\n" + "str q13, [x24, #0x10]\n" + "str q14, [x24, #0x20]\n" + "str q15, [x24, #0x30]\n" + "98:" // Height 2: Writeback done + "subs x11, x11, #0x20\n" + "bgt 51b\n" + "b 296f\n" + "99:" // Height 3 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, 
[%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "100:" // Height 3: Column loop + "cbz x9, 101f\n" + "ldr q8, [x9, #0x0]\n" "mov v12.16b, v8.16b\n" - "ldr q9, [x14, #0x10]\n" + "ldr q9, [x9, #0x10]\n" "mov v16.16b, v8.16b\n" - "ldr q10, [x14, #0x20]\n" - "ldr q11, [x14, #0x30]\n" + "ldr q10, [x9, #0x20]\n" + "ldr q11, [x9, #0x30]\n" "mov v13.16b, v9.16b\n" - "add x14, x14, #0x40\n" + "add x9, x9, #0x40\n" "mov v17.16b, v9.16b\n" "mov v14.16b, v10.16b\n" "mov v15.16b, v11.16b\n" "mov v18.16b, v10.16b\n" "mov v19.16b, v11.16b\n" - "b 123f\n" - "104:" // Height 3: no bias - "tbz %x[flags], #0, 122f\n" - "cmp x16, #0x20\n" - "bge 121f\n" - "tbz x16, #4, 112f\n" - "ld1 { v8.8h }, [x13], #0x10\n" - "ld1 { v12.8h }, [x9], #0x10\n" - "ld1 { v16.8h }, [x27], #0x10\n" - "ld1 { v9.8h }, [x13], #0x10\n" - "ld1 { v13.8h }, [x9], #0x10\n" - "ld1 { v17.8h }, [x27], #0x10\n" - "tbz x16, #3, 108f\n" - "ld1 { v10.8h }, [x13], #0x10\n" - "ld1 { v14.8h }, [x9], #0x10\n" - "ld1 { v18.8h }, [x27], #0x10\n" - "tbz x16, #2, 106f\n" - "ldr d11, [x13], #0x8\n" - "ldr d15, [x9], #0x8\n" - "ldr d19, [x27], #0x8\n" - "tbz x16, #1, 105f\n" - "mov x19, #0x3c\n" - "ld1 { v11.s }[2], [x13], #0x4\n" - "ld1 { v15.s }[2], [x9], #0x4\n" - "ld1 { v19.s }[2], [x27], #0x4\n" - "tbz x16, #0, 120f\n" - "ld1 { v11.h }[6], [x13]\n" - "ld1 { v15.h }[6], [x9]\n" - "ld1 { v19.h }[6], [x27]\n" "b 120f\n" - "105:" // Height 3: Partial accumulate: partial_1_28 + "101:" // Height 3: no bias + "tbz %x[flags], #0, 119f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x20\n" + "add x24, x28, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "bge 118f\n" + "tbz x11, #4, 109f\n" + "ld1 { v8.8h }, [x28], #0x10\n" + "ld1 { v12.8h }, [x24], #0x10\n" + "ld1 { v16.8h }, [x23], #0x10\n" + "ld1 { v9.8h }, [x28], #0x10\n" + "ld1 { v13.8h }, [x24], #0x10\n" + "ld1 { v17.8h }, [x23], #0x10\n" + "tbz x11, #3, 105f\n" + "ld1 { v10.8h }, [x28], #0x10\n" + "ld1 { v14.8h }, [x24], #0x10\n" 
+ "ld1 { v18.8h }, [x23], #0x10\n" + "tbz x11, #2, 103f\n" + "ldr d11, [x28], #0x8\n" + "ldr d15, [x24], #0x8\n" + "ldr d19, [x23], #0x8\n" + "tbz x11, #1, 102f\n" + "mov x19, #0x3c\n" + "ld1 { v11.s }[2], [x28], #0x4\n" + "ld1 { v15.s }[2], [x24], #0x4\n" + "ld1 { v19.s }[2], [x23], #0x4\n" + "tbz x11, #0, 117f\n" + "ld1 { v11.h }[6], [x28]\n" + "ld1 { v15.h }[6], [x24]\n" + "ld1 { v19.h }[6], [x23]\n" + "b 117f\n" + "102:" // Height 3: Partial accumulate: partial_1_28 "mov x19, #0x38\n" - "tbz x16, #0, 120f\n" - "ld1 { v11.h }[4], [x13]\n" - "ld1 { v15.h }[4], [x9]\n" - "ld1 { v19.h }[4], [x27]\n" - "b 120f\n" - "106:" // Height 3: Partial accumulate: partial_2_24 - "tbz x16, #1, 107f\n" - "ldr s11, [x13], #0x4\n" - "ldr s15, [x9], #0x4\n" - "ldr s19, [x27], #0x4\n" + "tbz x11, #0, 117f\n" + "ld1 { v11.h }[4], [x28]\n" + "ld1 { v15.h }[4], [x24]\n" + "ld1 { v19.h }[4], [x23]\n" + "b 117f\n" + "103:" // Height 3: Partial accumulate: partial_2_24 + "tbz x11, #1, 104f\n" + "ldr s11, [x28], #0x4\n" + "ldr s15, [x24], #0x4\n" "mov x19, #0x34\n" - "tbz x16, #0, 120f\n" - "ld1 { v11.h }[2], [x13]\n" - "ld1 { v15.h }[2], [x9]\n" - "ld1 { v19.h }[2], [x27]\n" - "b 120f\n" - "107:" // Height 3: Partial accumulate: partial_1_24 + "ldr s19, [x23], #0x4\n" + "tbz x11, #0, 117f\n" + "ld1 { v11.h }[2], [x28]\n" + "ld1 { v15.h }[2], [x24]\n" + "ld1 { v19.h }[2], [x23]\n" + "b 117f\n" + "104:" // Height 3: Partial accumulate: partial_1_24 "mov x19, #0x30\n" - "tbz x16, #0, 120f\n" - "ldr h11, [x13, #0x0]\n" - "ldr h15, [x9, #0x0]\n" - "ldr h19, [x27, #0x0]\n" - "b 120f\n" - "108:" // Height 3: Partial accumulate: partial_4_16 - "tbz x16, #2, 110f\n" - "ldr d10, [x13], #0x8\n" - "ldr d14, [x9], #0x8\n" - "ldr d18, [x27], #0x8\n" - "tbz x16, #1, 109f\n" + "tbz x11, #0, 117f\n" + "ldr h11, [x28, #0x0]\n" + "ldr h15, [x24, #0x0]\n" + "ldr h19, [x23, #0x0]\n" + "b 117f\n" + "105:" // Height 3: Partial accumulate: partial_4_16 + "tbz x11, #2, 107f\n" + "ldr d10, [x28], #0x8\n" + "ldr 
d14, [x24], #0x8\n" + "ldr d18, [x23], #0x8\n" + "tbz x11, #1, 106f\n" "mov x19, #0x2c\n" - "ld1 { v10.s }[2], [x13], #0x4\n" - "ld1 { v14.s }[2], [x9], #0x4\n" - "ld1 { v18.s }[2], [x27], #0x4\n" - "tbz x16, #0, 120f\n" - "ld1 { v10.h }[6], [x13]\n" - "ld1 { v14.h }[6], [x9]\n" - "ld1 { v18.h }[6], [x27]\n" - "b 120f\n" - "109:" // Height 3: Partial accumulate: partial_1_20 + "ld1 { v10.s }[2], [x28], #0x4\n" + "ld1 { v14.s }[2], [x24], #0x4\n" + "ld1 { v18.s }[2], [x23], #0x4\n" + "tbz x11, #0, 117f\n" + "ld1 { v10.h }[6], [x28]\n" + "ld1 { v14.h }[6], [x24]\n" + "ld1 { v18.h }[6], [x23]\n" + "b 117f\n" + "106:" // Height 3: Partial accumulate: partial_1_20 "mov x19, #0x28\n" - "tbz x16, #0, 120f\n" - "ld1 { v10.h }[4], [x13]\n" - "ld1 { v14.h }[4], [x9]\n" - "ld1 { v18.h }[4], [x27]\n" - "b 120f\n" - "110:" // Height 3: Partial accumulate: partial_2_16 - "tbz x16, #1, 111f\n" - "ldr s10, [x13], #0x4\n" - "ldr s14, [x9], #0x4\n" - "ldr s18, [x27], #0x4\n" + "tbz x11, #0, 117f\n" + "ld1 { v10.h }[4], [x28]\n" + "ld1 { v14.h }[4], [x24]\n" + "ld1 { v18.h }[4], [x23]\n" + "b 117f\n" + "107:" // Height 3: Partial accumulate: partial_2_16 + "tbz x11, #1, 108f\n" + "ldr s10, [x28], #0x4\n" + "ldr s14, [x24], #0x4\n" "mov x19, #0x24\n" - "tbz x16, #0, 120f\n" - "ld1 { v10.h }[2], [x13]\n" - "ld1 { v14.h }[2], [x9]\n" - "ld1 { v18.h }[2], [x27]\n" - "b 120f\n" - "111:" // Height 3: Partial accumulate: partial_1_16 + "ldr s18, [x23], #0x4\n" + "tbz x11, #0, 117f\n" + "ld1 { v10.h }[2], [x28]\n" + "ld1 { v14.h }[2], [x24]\n" + "ld1 { v18.h }[2], [x23]\n" + "b 117f\n" + "108:" // Height 3: Partial accumulate: partial_1_16 "mov x19, #0x20\n" - "tbz x16, #0, 120f\n" - "ldr h10, [x13, #0x0]\n" - "ldr h14, [x9, #0x0]\n" - "ldr h18, [x27, #0x0]\n" - "b 120f\n" - "112:" // Height 3: Partial accumulate: partial_8_0 - "tbz x16, #3, 116f\n" - "ld1 { v8.8h }, [x13], #0x10\n" - "ld1 { v12.8h }, [x9], #0x10\n" - "ld1 { v16.8h }, [x27], #0x10\n" - "tbz x16, #2, 114f\n" - "ldr d9, [x13], 
#0x8\n" - "ldr d13, [x9], #0x8\n" - "ldr d17, [x27], #0x8\n" - "tbz x16, #1, 113f\n" + "tbz x11, #0, 117f\n" + "ldr h10, [x28, #0x0]\n" + "ldr h14, [x24, #0x0]\n" + "ldr h18, [x23, #0x0]\n" + "b 117f\n" + "109:" // Height 3: Partial accumulate: partial_8_0 + "tbz x11, #3, 113f\n" + "ld1 { v8.8h }, [x28], #0x10\n" + "ld1 { v12.8h }, [x24], #0x10\n" + "ld1 { v16.8h }, [x23], #0x10\n" + "tbz x11, #2, 111f\n" + "ldr d9, [x28], #0x8\n" + "ldr d13, [x24], #0x8\n" + "ldr d17, [x23], #0x8\n" + "tbz x11, #1, 110f\n" "mov x19, #0x1c\n" - "ld1 { v9.s }[2], [x13], #0x4\n" - "ld1 { v13.s }[2], [x9], #0x4\n" - "ld1 { v17.s }[2], [x27], #0x4\n" - "tbz x16, #0, 120f\n" - "ld1 { v9.h }[6], [x13]\n" - "ld1 { v13.h }[6], [x9]\n" - "ld1 { v17.h }[6], [x27]\n" - "b 120f\n" - "113:" // Height 3: Partial accumulate: partial_1_12 + "ld1 { v9.s }[2], [x28], #0x4\n" + "ld1 { v13.s }[2], [x24], #0x4\n" + "ld1 { v17.s }[2], [x23], #0x4\n" + "tbz x11, #0, 117f\n" + "ld1 { v9.h }[6], [x28]\n" + "ld1 { v13.h }[6], [x24]\n" + "ld1 { v17.h }[6], [x23]\n" + "b 117f\n" + "110:" // Height 3: Partial accumulate: partial_1_12 "mov x19, #0x18\n" - "tbz x16, #0, 120f\n" - "ld1 { v9.h }[4], [x13]\n" - "ld1 { v13.h }[4], [x9]\n" - "ld1 { v17.h }[4], [x27]\n" - "b 120f\n" - "114:" // Height 3: Partial accumulate: partial_2_8 - "tbz x16, #1, 115f\n" - "ldr s9, [x13], #0x4\n" - "ldr s13, [x9], #0x4\n" - "ldr s17, [x27], #0x4\n" + "tbz x11, #0, 117f\n" + "ld1 { v9.h }[4], [x28]\n" + "ld1 { v13.h }[4], [x24]\n" + "ld1 { v17.h }[4], [x23]\n" + "b 117f\n" + "111:" // Height 3: Partial accumulate: partial_2_8 + "tbz x11, #1, 112f\n" + "ldr s9, [x28], #0x4\n" + "ldr s13, [x24], #0x4\n" "mov x19, #0x14\n" - "tbz x16, #0, 120f\n" - "ld1 { v9.h }[2], [x13]\n" - "ld1 { v13.h }[2], [x9]\n" - "ld1 { v17.h }[2], [x27]\n" - "b 120f\n" - "115:" // Height 3: Partial accumulate: partial_1_8 + "ldr s17, [x23], #0x4\n" + "tbz x11, #0, 117f\n" + "ld1 { v9.h }[2], [x28]\n" + "ld1 { v13.h }[2], [x24]\n" + "ld1 { v17.h }[2], 
[x23]\n" + "b 117f\n" + "112:" // Height 3: Partial accumulate: partial_1_8 "mov x19, #0x10\n" - "tbz x16, #0, 120f\n" - "ldr h9, [x13, #0x0]\n" - "ldr h13, [x9, #0x0]\n" - "ldr h17, [x27, #0x0]\n" - "b 120f\n" - "116:" // Height 3: Partial accumulate: partial_4_0 - "tbz x16, #2, 118f\n" - "ldr d8, [x13], #0x8\n" - "ldr d12, [x9], #0x8\n" - "ldr d16, [x27], #0x8\n" - "tbz x16, #1, 117f\n" + "tbz x11, #0, 117f\n" + "ldr h9, [x28, #0x0]\n" + "ldr h13, [x24, #0x0]\n" + "ldr h17, [x23, #0x0]\n" + "b 117f\n" + "113:" // Height 3: Partial accumulate: partial_4_0 + "tbz x11, #2, 115f\n" + "ldr d8, [x28], #0x8\n" + "ldr d12, [x24], #0x8\n" + "ldr d16, [x23], #0x8\n" + "tbz x11, #1, 114f\n" "mov x19, #0xc\n" - "ld1 { v8.s }[2], [x13], #0x4\n" - "ld1 { v12.s }[2], [x9], #0x4\n" - "ld1 { v16.s }[2], [x27], #0x4\n" - "tbz x16, #0, 120f\n" - "ld1 { v8.h }[6], [x13]\n" - "ld1 { v12.h }[6], [x9]\n" - "ld1 { v16.h }[6], [x27]\n" - "b 120f\n" - "117:" // Height 3: Partial accumulate: partial_1_4 + "ld1 { v8.s }[2], [x28], #0x4\n" + "ld1 { v12.s }[2], [x24], #0x4\n" + "ld1 { v16.s }[2], [x23], #0x4\n" + "tbz x11, #0, 117f\n" + "ld1 { v8.h }[6], [x28]\n" + "ld1 { v12.h }[6], [x24]\n" + "ld1 { v16.h }[6], [x23]\n" + "b 117f\n" + "114:" // Height 3: Partial accumulate: partial_1_4 "mov x19, #0x8\n" - "tbz x16, #0, 120f\n" - "ld1 { v8.h }[4], [x13]\n" - "ld1 { v12.h }[4], [x9]\n" - "ld1 { v16.h }[4], [x27]\n" - "b 120f\n" - "118:" // Height 3: Partial accumulate: partial_2_0 - "tbz x16, #1, 119f\n" - "ldr s8, [x13], #0x4\n" - "ldr s12, [x9], #0x4\n" - "ldr s16, [x27], #0x4\n" + "tbz x11, #0, 117f\n" + "ld1 { v8.h }[4], [x28]\n" + "ld1 { v12.h }[4], [x24]\n" + "ld1 { v16.h }[4], [x23]\n" + "b 117f\n" + "115:" // Height 3: Partial accumulate: partial_2_0 + "tbz x11, #1, 116f\n" + "ldr s8, [x28], #0x4\n" + "ldr s12, [x24], #0x4\n" "mov x19, #0x4\n" - "tbz x16, #0, 120f\n" - "ld1 { v8.h }[2], [x13]\n" - "ld1 { v12.h }[2], [x9]\n" - "ld1 { v16.h }[2], [x27]\n" - "b 120f\n" - "119:" // Height 
3: Partial accumulate: partial_1_0 + "ldr s16, [x23], #0x4\n" + "tbz x11, #0, 117f\n" + "ld1 { v8.h }[2], [x28]\n" + "ld1 { v12.h }[2], [x24]\n" + "ld1 { v16.h }[2], [x23]\n" + "b 117f\n" + "116:" // Height 3: Partial accumulate: partial_1_0 + "ldr h8, [x28, #0x0]\n" "mov x19, #0x0\n" - "ldr h8, [x13, #0x0]\n" - "ldr h12, [x9, #0x0]\n" - "ldr h16, [x27, #0x0]\n" - "120:" // Height 3: Partial accumulate: Done - "sub x13, x13, x19\n" - "sub x9, x9, x19\n" - "sub x27, x27, x19\n" - "b 123f\n" - "121:" // Height 3: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "ldr q12, [x9, #0x0]\n" - "ldr q13, [x9, #0x10]\n" - "ldr q14, [x9, #0x20]\n" - "ldr q15, [x9, #0x30]\n" - "ldr q16, [x27, #0x0]\n" - "ldr q17, [x27, #0x10]\n" - "ldr q18, [x27, #0x20]\n" - "ldr q19, [x27, #0x30]\n" - "b 123f\n" - "122:" // Height 3: no accumulate + "ldr h12, [x24, #0x0]\n" + "ldr h16, [x23, #0x0]\n" + "117:" // Height 3: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 120f\n" + "118:" // Height 3: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "ldr q12, [x24, #0x0]\n" + "ldr q13, [x24, #0x10]\n" + "ldr q14, [x24, #0x20]\n" + "ldr q15, [x24, #0x30]\n" + "ldr q16, [x23, #0x0]\n" + "ldr q17, [x23, #0x10]\n" + "ldr q18, [x23, #0x20]\n" + "ldr q19, [x23, #0x30]\n" + "b 120f\n" + "119:" // Height 3: no accumulate "movi v8.16b, #0x0\n" "movi v9.16b, #0x0\n" "movi v10.16b, #0x0\n" @@ -1413,349 +1390,352 @@ void a64_hybrid_fp16_mla_6x32 ( "movi v17.16b, #0x0\n" "movi v18.16b, #0x0\n" "movi v19.16b, #0x0\n" - "123:" // Height 3: setup done - "mov x12, #0x0\n" - "124:" // Height 3: String loop + "120:" // Height 3: setup done + "mov x27, #0x0\n" + "121:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], 
#3, 125f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 122f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "cbnz x12, 126f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "cbnz x27, 123f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #1\n" - "add x28, x28, x19, LSL #1\n" - "add x26, x26, x19, LSL #1\n" - "b 126f\n" - "125:" // Height 3: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #1\n" - "add x26, x28, x19, LSL #1\n" - "126:" // Height 3: input setup done - "cmp x11, #0x8\n" - "blt 129f\n" - "cmp x11, #0x10\n" - "blt 128f\n" - "127:" // Height 3: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "add x25, x25, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" + "b 123f\n" + "122:" // Height 3: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "123:" // Height 3: input setup done + "cmp x26, #0x8\n" + "blt 126f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x10\n" + "ldr q2, [x23, #0x0]\n" + "ldr q6, [x10, #0x0]\n" + "blt 125f\n" + "124:" // Height 3: Multiply loop: Main loop head "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" + "add x25, x25, #0x10\n" "fmla v12.8h, v6.8h, v1.h[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" "fmla v16.8h, v6.8h, v2.h[0]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q6, [x10, #0x20]\n" + "add x23, x23, #0x10\n" "fmla v9.8h, v7.8h, v0.h[0]\n" - "add x28, x28, #0x10\n" - "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sub x26, 
x26, #0x8\n" "fmla v13.8h, v7.8h, v1.h[0]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x23, #0x80]\n" + "cmp x26, #0x10\n" "fmla v17.8h, v7.8h, v2.h[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" "fmla v10.8h, v6.8h, v0.h[0]\n" - "sub x11, x11, #0x8\n" "fmla v14.8h, v6.8h, v1.h[0]\n" - "cmp x11, #0x10\n" "fmla v18.8h, v6.8h, v2.h[0]\n" - "ldr q6, [x15, #0x40]\n" + "ldr q6, [x10, #0x40]\n" "fmla v11.8h, v7.8h, v0.h[0]\n" "fmla v15.8h, v7.8h, v1.h[0]\n" "fmla v19.8h, v7.8h, v2.h[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" "fmla v8.8h, v6.8h, v0.h[1]\n" "fmla v12.8h, v6.8h, v1.h[1]\n" "fmla v16.8h, v6.8h, v2.h[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" "fmla v9.8h, v7.8h, v0.h[1]\n" "fmla v13.8h, v7.8h, v1.h[1]\n" "fmla v17.8h, v7.8h, v2.h[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" "fmla v10.8h, v6.8h, v0.h[1]\n" "fmla v14.8h, v6.8h, v1.h[1]\n" "fmla v18.8h, v6.8h, v2.h[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" "fmla v11.8h, v7.8h, v0.h[1]\n" "fmla v15.8h, v7.8h, v1.h[1]\n" "fmla v19.8h, v7.8h, v2.h[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" "fmla v8.8h, v6.8h, v0.h[2]\n" "fmla v12.8h, v6.8h, v1.h[2]\n" "fmla v16.8h, v6.8h, v2.h[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" "fmla v9.8h, v7.8h, v0.h[2]\n" "fmla v13.8h, v7.8h, v1.h[2]\n" "fmla v17.8h, v7.8h, v2.h[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" "fmla v10.8h, v6.8h, v0.h[2]\n" "fmla v14.8h, v6.8h, v1.h[2]\n" "fmla v18.8h, v6.8h, v2.h[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" "fmla v11.8h, v7.8h, v0.h[2]\n" "fmla v15.8h, v7.8h, v1.h[2]\n" "fmla v19.8h, v7.8h, v2.h[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" "fmla v8.8h, v6.8h, v0.h[3]\n" "fmla v12.8h, v6.8h, v1.h[3]\n" "fmla v16.8h, v6.8h, v2.h[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" "fmla v9.8h, v7.8h, v0.h[3]\n" "fmla v13.8h, v7.8h, 
v1.h[3]\n" "fmla v17.8h, v7.8h, v2.h[3]\n" - "ldr q7, [x15, #0xf0]\n" + "ldr q7, [x10, #0xf0]\n" "fmla v10.8h, v6.8h, v0.h[3]\n" "fmla v14.8h, v6.8h, v1.h[3]\n" "fmla v18.8h, v6.8h, v2.h[3]\n" - "ldr q6, [x15, #0x100]\n" + "ldr q6, [x10, #0x100]\n" "fmla v11.8h, v7.8h, v0.h[3]\n" "fmla v15.8h, v7.8h, v1.h[3]\n" "fmla v19.8h, v7.8h, v2.h[3]\n" - "ldr q7, [x15, #0x110]\n" + "ldr q7, [x10, #0x110]\n" "fmla v8.8h, v6.8h, v0.h[4]\n" "fmla v12.8h, v6.8h, v1.h[4]\n" "fmla v16.8h, v6.8h, v2.h[4]\n" - "ldr q6, [x15, #0x120]\n" + "ldr q6, [x10, #0x120]\n" "fmla v9.8h, v7.8h, v0.h[4]\n" "fmla v13.8h, v7.8h, v1.h[4]\n" "fmla v17.8h, v7.8h, v2.h[4]\n" - "ldr q7, [x15, #0x130]\n" + "ldr q7, [x10, #0x130]\n" "fmla v10.8h, v6.8h, v0.h[4]\n" "fmla v14.8h, v6.8h, v1.h[4]\n" "fmla v18.8h, v6.8h, v2.h[4]\n" - "ldr q6, [x15, #0x140]\n" + "ldr q6, [x10, #0x140]\n" "fmla v11.8h, v7.8h, v0.h[4]\n" "fmla v15.8h, v7.8h, v1.h[4]\n" "fmla v19.8h, v7.8h, v2.h[4]\n" - "ldr q7, [x15, #0x150]\n" + "ldr q7, [x10, #0x150]\n" "fmla v8.8h, v6.8h, v0.h[5]\n" "fmla v12.8h, v6.8h, v1.h[5]\n" "fmla v16.8h, v6.8h, v2.h[5]\n" - "ldr q6, [x15, #0x160]\n" + "ldr q6, [x10, #0x160]\n" "fmla v9.8h, v7.8h, v0.h[5]\n" "fmla v13.8h, v7.8h, v1.h[5]\n" "fmla v17.8h, v7.8h, v2.h[5]\n" - "ldr q7, [x15, #0x170]\n" + "ldr q7, [x10, #0x170]\n" "fmla v10.8h, v6.8h, v0.h[5]\n" "fmla v14.8h, v6.8h, v1.h[5]\n" "fmla v18.8h, v6.8h, v2.h[5]\n" - "ldr q6, [x15, #0x180]\n" + "ldr q6, [x10, #0x180]\n" "fmla v11.8h, v7.8h, v0.h[5]\n" "fmla v15.8h, v7.8h, v1.h[5]\n" "fmla v19.8h, v7.8h, v2.h[5]\n" - "ldr q7, [x15, #0x190]\n" + "ldr q7, [x10, #0x190]\n" "fmla v8.8h, v6.8h, v0.h[6]\n" "fmla v12.8h, v6.8h, v1.h[6]\n" "fmla v16.8h, v6.8h, v2.h[6]\n" - "ldr q6, [x15, #0x1a0]\n" + "ldr q6, [x10, #0x1a0]\n" "fmla v9.8h, v7.8h, v0.h[6]\n" "fmla v13.8h, v7.8h, v1.h[6]\n" "fmla v17.8h, v7.8h, v2.h[6]\n" - "ldr q7, [x15, #0x1b0]\n" + "ldr q7, [x10, #0x1b0]\n" "fmla v10.8h, v6.8h, v0.h[6]\n" "fmla v14.8h, v6.8h, v1.h[6]\n" "fmla v18.8h, v6.8h, 
v2.h[6]\n" - "ldr q6, [x15, #0x1c0]\n" + "ldr q6, [x10, #0x1c0]\n" "fmla v11.8h, v7.8h, v0.h[6]\n" "fmla v15.8h, v7.8h, v1.h[6]\n" "fmla v19.8h, v7.8h, v2.h[6]\n" - "ldr q7, [x15, #0x1d0]\n" + "ldr q7, [x10, #0x1d0]\n" "fmla v8.8h, v6.8h, v0.h[7]\n" "fmla v12.8h, v6.8h, v1.h[7]\n" "fmla v16.8h, v6.8h, v2.h[7]\n" - "ldr q6, [x15, #0x1e0]\n" + "ldr q6, [x10, #0x1e0]\n" "fmla v9.8h, v7.8h, v0.h[7]\n" "fmla v13.8h, v7.8h, v1.h[7]\n" "fmla v17.8h, v7.8h, v2.h[7]\n" - "ldr q7, [x15, #0x1f0]\n" - "add x15, x15, #0x200\n" + "ldr q7, [x10, #0x1f0]\n" + "add x10, x10, #0x200\n" "fmla v10.8h, v6.8h, v0.h[7]\n" "fmla v14.8h, v6.8h, v1.h[7]\n" "fmla v18.8h, v6.8h, v2.h[7]\n" + "ldr q6, [x10, #0x0]\n" "fmla v11.8h, v7.8h, v0.h[7]\n" + "ldr q0, [x25, #0x0]\n" "fmla v15.8h, v7.8h, v1.h[7]\n" + "ldr q1, [x24, #0x0]\n" "fmla v19.8h, v7.8h, v2.h[7]\n" - "bge 127b\n" - "128:" // Height 3: Multiply loop: Single iteration only - "sub x11, x11, #0x8\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "ldr q2, [x23, #0x0]\n" + "bge 124b\n" + "125:" // Height 3: Multiply loop: Single iteration only "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" + "sub x26, x26, #0x8\n" "fmla v12.8h, v6.8h, v1.h[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "add x25, x25, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" "fmla v16.8h, v6.8h, v2.h[0]\n" - "ldr q6, [x15, #0x20]\n" + "add x24, x24, #0x10\n" "fmla v9.8h, v7.8h, v0.h[0]\n" - "add x28, x28, #0x10\n" - "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x10, #0x20]\n" + "add x23, x23, #0x10\n" "fmla v13.8h, v7.8h, v1.h[0]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" "fmla v17.8h, v7.8h, v2.h[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "ldr q7, [x15, #0x30]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "ldr q7, [x10, #0x30]\n" "fmla v10.8h, v6.8h, v0.h[0]\n" "fmla v14.8h, v6.8h, v1.h[0]\n" "fmla v18.8h, v6.8h, v2.h[0]\n" - "ldr q6, 
[x15, #0x40]\n" + "ldr q6, [x10, #0x40]\n" "fmla v11.8h, v7.8h, v0.h[0]\n" "fmla v15.8h, v7.8h, v1.h[0]\n" "fmla v19.8h, v7.8h, v2.h[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" "fmla v8.8h, v6.8h, v0.h[1]\n" "fmla v12.8h, v6.8h, v1.h[1]\n" "fmla v16.8h, v6.8h, v2.h[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" "fmla v9.8h, v7.8h, v0.h[1]\n" "fmla v13.8h, v7.8h, v1.h[1]\n" "fmla v17.8h, v7.8h, v2.h[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" "fmla v10.8h, v6.8h, v0.h[1]\n" "fmla v14.8h, v6.8h, v1.h[1]\n" "fmla v18.8h, v6.8h, v2.h[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" "fmla v11.8h, v7.8h, v0.h[1]\n" "fmla v15.8h, v7.8h, v1.h[1]\n" "fmla v19.8h, v7.8h, v2.h[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" "fmla v8.8h, v6.8h, v0.h[2]\n" "fmla v12.8h, v6.8h, v1.h[2]\n" "fmla v16.8h, v6.8h, v2.h[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" "fmla v9.8h, v7.8h, v0.h[2]\n" "fmla v13.8h, v7.8h, v1.h[2]\n" "fmla v17.8h, v7.8h, v2.h[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" "fmla v10.8h, v6.8h, v0.h[2]\n" "fmla v14.8h, v6.8h, v1.h[2]\n" "fmla v18.8h, v6.8h, v2.h[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" "fmla v11.8h, v7.8h, v0.h[2]\n" "fmla v15.8h, v7.8h, v1.h[2]\n" "fmla v19.8h, v7.8h, v2.h[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" "fmla v8.8h, v6.8h, v0.h[3]\n" "fmla v12.8h, v6.8h, v1.h[3]\n" "fmla v16.8h, v6.8h, v2.h[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" "fmla v9.8h, v7.8h, v0.h[3]\n" "fmla v13.8h, v7.8h, v1.h[3]\n" "fmla v17.8h, v7.8h, v2.h[3]\n" - "ldr q7, [x15, #0xf0]\n" + "ldr q7, [x10, #0xf0]\n" "fmla v10.8h, v6.8h, v0.h[3]\n" "fmla v14.8h, v6.8h, v1.h[3]\n" "fmla v18.8h, v6.8h, v2.h[3]\n" - "ldr q6, [x15, #0x100]\n" + "ldr q6, [x10, #0x100]\n" "fmla v11.8h, v7.8h, v0.h[3]\n" "fmla v15.8h, v7.8h, v1.h[3]\n" "fmla v19.8h, v7.8h, v2.h[3]\n" - "ldr q7, [x15, #0x110]\n" + "ldr q7, [x10, #0x110]\n" "fmla 
v8.8h, v6.8h, v0.h[4]\n" "fmla v12.8h, v6.8h, v1.h[4]\n" "fmla v16.8h, v6.8h, v2.h[4]\n" - "ldr q6, [x15, #0x120]\n" + "ldr q6, [x10, #0x120]\n" "fmla v9.8h, v7.8h, v0.h[4]\n" "fmla v13.8h, v7.8h, v1.h[4]\n" "fmla v17.8h, v7.8h, v2.h[4]\n" - "ldr q7, [x15, #0x130]\n" + "ldr q7, [x10, #0x130]\n" "fmla v10.8h, v6.8h, v0.h[4]\n" "fmla v14.8h, v6.8h, v1.h[4]\n" "fmla v18.8h, v6.8h, v2.h[4]\n" - "ldr q6, [x15, #0x140]\n" + "ldr q6, [x10, #0x140]\n" "fmla v11.8h, v7.8h, v0.h[4]\n" "fmla v15.8h, v7.8h, v1.h[4]\n" "fmla v19.8h, v7.8h, v2.h[4]\n" - "ldr q7, [x15, #0x150]\n" + "ldr q7, [x10, #0x150]\n" "fmla v8.8h, v6.8h, v0.h[5]\n" "fmla v12.8h, v6.8h, v1.h[5]\n" "fmla v16.8h, v6.8h, v2.h[5]\n" - "ldr q6, [x15, #0x160]\n" + "ldr q6, [x10, #0x160]\n" "fmla v9.8h, v7.8h, v0.h[5]\n" "fmla v13.8h, v7.8h, v1.h[5]\n" "fmla v17.8h, v7.8h, v2.h[5]\n" - "ldr q7, [x15, #0x170]\n" + "ldr q7, [x10, #0x170]\n" "fmla v10.8h, v6.8h, v0.h[5]\n" "fmla v14.8h, v6.8h, v1.h[5]\n" "fmla v18.8h, v6.8h, v2.h[5]\n" - "ldr q6, [x15, #0x180]\n" + "ldr q6, [x10, #0x180]\n" "fmla v11.8h, v7.8h, v0.h[5]\n" "fmla v15.8h, v7.8h, v1.h[5]\n" "fmla v19.8h, v7.8h, v2.h[5]\n" - "ldr q7, [x15, #0x190]\n" + "ldr q7, [x10, #0x190]\n" "fmla v8.8h, v6.8h, v0.h[6]\n" "fmla v12.8h, v6.8h, v1.h[6]\n" "fmla v16.8h, v6.8h, v2.h[6]\n" - "ldr q6, [x15, #0x1a0]\n" + "ldr q6, [x10, #0x1a0]\n" "fmla v9.8h, v7.8h, v0.h[6]\n" "fmla v13.8h, v7.8h, v1.h[6]\n" "fmla v17.8h, v7.8h, v2.h[6]\n" - "ldr q7, [x15, #0x1b0]\n" + "ldr q7, [x10, #0x1b0]\n" "fmla v10.8h, v6.8h, v0.h[6]\n" "fmla v14.8h, v6.8h, v1.h[6]\n" "fmla v18.8h, v6.8h, v2.h[6]\n" - "ldr q6, [x15, #0x1c0]\n" + "ldr q6, [x10, #0x1c0]\n" "fmla v11.8h, v7.8h, v0.h[6]\n" "fmla v15.8h, v7.8h, v1.h[6]\n" "fmla v19.8h, v7.8h, v2.h[6]\n" - "ldr q7, [x15, #0x1d0]\n" + "ldr q7, [x10, #0x1d0]\n" "fmla v8.8h, v6.8h, v0.h[7]\n" "fmla v12.8h, v6.8h, v1.h[7]\n" "fmla v16.8h, v6.8h, v2.h[7]\n" - "ldr q6, [x15, #0x1e0]\n" + "ldr q6, [x10, #0x1e0]\n" "fmla v9.8h, v7.8h, v0.h[7]\n" "fmla 
v13.8h, v7.8h, v1.h[7]\n" "fmla v17.8h, v7.8h, v2.h[7]\n" - "ldr q7, [x15, #0x1f0]\n" - "add x15, x15, #0x200\n" + "ldr q7, [x10, #0x1f0]\n" + "add x10, x10, #0x200\n" "fmla v10.8h, v6.8h, v0.h[7]\n" "fmla v14.8h, v6.8h, v1.h[7]\n" "fmla v18.8h, v6.8h, v2.h[7]\n" "fmla v11.8h, v7.8h, v0.h[7]\n" "fmla v15.8h, v7.8h, v1.h[7]\n" "fmla v19.8h, v7.8h, v2.h[7]\n" - "129:" // Height 3: Multiply loop: Main loop skip - "cbz x11, 131f\n" - "130:" // Height 3: Multiply loop: Odd block loop - "ldr h0, [x10], #0x2\n" - "ldr h1, [x28], #0x2\n" - "ldr h2, [x26], #0x2\n" - "ldr q6, [x15, #0x0]\n" + "126:" // Height 3: Multiply loop: Main loop skip + "cbz x26, 128f\n" + "127:" // Height 3: Multiply loop: Odd block loop + "ldr h0, [x25], #0x2\n" + "sub x26, x26, #0x1\n" + "ldr h1, [x24], #0x2\n" + "ldr h2, [x23], #0x2\n" + "ldr q6, [x10, #0x0]\n" "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" "fmla v12.8h, v6.8h, v1.h[0]\n" - "sub x11, x11, #0x1\n" "fmla v16.8h, v6.8h, v2.h[0]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q6, [x10, #0x20]\n" "fmla v9.8h, v7.8h, v0.h[0]\n" "fmla v13.8h, v7.8h, v1.h[0]\n" "fmla v17.8h, v7.8h, v2.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" "fmla v10.8h, v6.8h, v0.h[0]\n" - "add x15, x15, #0x40\n" "fmla v14.8h, v6.8h, v1.h[0]\n" "fmla v18.8h, v6.8h, v2.h[0]\n" "fmla v11.8h, v7.8h, v0.h[0]\n" "fmla v15.8h, v7.8h, v1.h[0]\n" "fmla v19.8h, v7.8h, v2.h[0]\n" - "cbnz x11, 130b\n" - "131:" // Height 3: Multiply loop: No odd multiplies + "cbnz x26, 127b\n" + "128:" // Height 3: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 124b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "prfm pstl1keep, [x27, #0x0]\n" - "tbz %x[flags], #1, 132f\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 121b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, 
#0x0]\n" + "add x24, x28, x19, LSL #1\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #1\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbz %x[flags], #1, 129f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1r { v1.8h }, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -1784,217 +1764,200 @@ void a64_hybrid_fp16_mla_6x32 ( "fmin v19.8h, v19.8h, v0.8h\n" "fmax v18.8h, v18.8h, v1.8h\n" "fmax v19.8h, v19.8h, v1.8h\n" - "132:" // Height 3: No activation - "cmp x16, #0x20\n" - "bge 149f\n" - "tbz x16, #4, 140f\n" - "st1 { v8.8h }, [x13], #0x10\n" - "st1 { v9.8h }, [x13], #0x10\n" - "st1 { v12.8h }, [x9], #0x10\n" - "st1 { v13.8h }, [x9], #0x10\n" - "st1 { v16.8h }, [x27], #0x10\n" - "st1 { v17.8h }, [x27], #0x10\n" - "tbz x16, #3, 136f\n" - "st1 { v10.8h }, [x13], #0x10\n" - "st1 { v14.8h }, [x9], #0x10\n" - "st1 { v18.8h }, [x27], #0x10\n" - "tbz x16, #2, 134f\n" - "str d11, [x13], #0x8\n" - "str d15, [x9], #0x8\n" - "str d19, [x27], #0x8\n" - "tbz x16, #1, 133f\n" - "st1 { v11.s }[2], [x13], #0x4\n" - "st1 { v15.s }[2], [x9], #0x4\n" - "st1 { v19.s }[2], [x27], #0x4\n" - "tbz x16, #0, 148f\n" - "st1 { v11.h }[6], [x13]\n" - "st1 { v15.h }[6], [x9]\n" - "st1 { v19.h }[6], [x27]\n" - "b 148f\n" - "133:" // Height 3: Partial direct writeback: partial_1_28 - "tbz x16, #0, 148f\n" - "st1 { v11.h }[4], [x13]\n" - "st1 { v15.h }[4], [x9]\n" - "st1 { v19.h }[4], [x27]\n" - "b 148f\n" - "134:" // Height 3: Partial direct writeback: partial_2_24 - "tbz x16, #1, 135f\n" - "str s11, [x13], #0x4\n" - "str s15, [x9], #0x4\n" - "str s19, [x27], #0x4\n" - "tbz x16, #0, 148f\n" - "st1 { v11.h }[2], [x13]\n" - "st1 { v15.h }[2], [x9]\n" - "st1 { v19.h }[2], [x27]\n" - "b 148f\n" - "135:" // Height 3: Partial direct writeback: partial_1_24 - "tbz x16, #0, 148f\n" - "str h11, [x13, #0x0]\n" - "str h15, [x9, #0x0]\n" - "str h19, [x27, #0x0]\n" - "b 148f\n" - "136:" // Height 3: Partial direct writeback: partial_4_16 - "tbz x16, #2, 138f\n" - "str d10, [x13], #0x8\n" - "str 
d14, [x9], #0x8\n" - "str d18, [x27], #0x8\n" - "tbz x16, #1, 137f\n" - "st1 { v10.s }[2], [x13], #0x4\n" - "st1 { v14.s }[2], [x9], #0x4\n" - "st1 { v18.s }[2], [x27], #0x4\n" - "tbz x16, #0, 148f\n" - "st1 { v10.h }[6], [x13]\n" - "st1 { v14.h }[6], [x9]\n" - "st1 { v18.h }[6], [x27]\n" - "b 148f\n" - "137:" // Height 3: Partial direct writeback: partial_1_20 - "tbz x16, #0, 148f\n" - "st1 { v10.h }[4], [x13]\n" - "st1 { v14.h }[4], [x9]\n" - "st1 { v18.h }[4], [x27]\n" - "b 148f\n" - "138:" // Height 3: Partial direct writeback: partial_2_16 - "tbz x16, #1, 139f\n" - "str s10, [x13], #0x4\n" - "str s14, [x9], #0x4\n" - "str s18, [x27], #0x4\n" - "tbz x16, #0, 148f\n" - "st1 { v10.h }[2], [x13]\n" - "st1 { v14.h }[2], [x9]\n" - "st1 { v18.h }[2], [x27]\n" - "b 148f\n" - "139:" // Height 3: Partial direct writeback: partial_1_16 - "tbz x16, #0, 148f\n" - "str h10, [x13, #0x0]\n" - "str h14, [x9, #0x0]\n" - "str h18, [x27, #0x0]\n" - "b 148f\n" - "140:" // Height 3: Partial direct writeback: partial_8_0 - "tbz x16, #3, 144f\n" - "st1 { v8.8h }, [x13], #0x10\n" - "st1 { v12.8h }, [x9], #0x10\n" - "st1 { v16.8h }, [x27], #0x10\n" - "tbz x16, #2, 142f\n" - "str d9, [x13], #0x8\n" - "str d13, [x9], #0x8\n" - "str d17, [x27], #0x8\n" - "tbz x16, #1, 141f\n" - "st1 { v9.s }[2], [x13], #0x4\n" - "st1 { v13.s }[2], [x9], #0x4\n" - "st1 { v17.s }[2], [x27], #0x4\n" - "tbz x16, #0, 148f\n" - "st1 { v9.h }[6], [x13]\n" - "st1 { v13.h }[6], [x9]\n" - "st1 { v17.h }[6], [x27]\n" - "b 148f\n" - "141:" // Height 3: Partial direct writeback: partial_1_12 - "tbz x16, #0, 148f\n" - "st1 { v9.h }[4], [x13]\n" - "st1 { v13.h }[4], [x9]\n" - "st1 { v17.h }[4], [x27]\n" - "b 148f\n" - "142:" // Height 3: Partial direct writeback: partial_2_8 - "tbz x16, #1, 143f\n" - "str s9, [x13], #0x4\n" - "str s13, [x9], #0x4\n" - "str s17, [x27], #0x4\n" - "tbz x16, #0, 148f\n" - "st1 { v9.h }[2], [x13]\n" - "st1 { v13.h }[2], [x9]\n" - "st1 { v17.h }[2], [x27]\n" - "b 148f\n" - "143:" // Height 3: 
Partial direct writeback: partial_1_8 - "tbz x16, #0, 148f\n" - "str h9, [x13, #0x0]\n" - "str h13, [x9, #0x0]\n" - "str h17, [x27, #0x0]\n" - "b 148f\n" - "144:" // Height 3: Partial direct writeback: partial_4_0 - "tbz x16, #2, 146f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "str d16, [x27], #0x8\n" - "tbz x16, #1, 145f\n" - "st1 { v8.s }[2], [x13], #0x4\n" - "st1 { v12.s }[2], [x9], #0x4\n" - "st1 { v16.s }[2], [x27], #0x4\n" - "tbz x16, #0, 148f\n" - "st1 { v8.h }[6], [x13]\n" - "st1 { v12.h }[6], [x9]\n" - "st1 { v16.h }[6], [x27]\n" - "b 148f\n" - "145:" // Height 3: Partial direct writeback: partial_1_4 - "tbz x16, #0, 148f\n" - "st1 { v8.h }[4], [x13]\n" - "st1 { v12.h }[4], [x9]\n" - "st1 { v16.h }[4], [x27]\n" - "b 148f\n" - "146:" // Height 3: Partial direct writeback: partial_2_0 - "tbz x16, #1, 147f\n" - "str s8, [x13], #0x4\n" - "str s12, [x9], #0x4\n" - "str s16, [x27], #0x4\n" - "tbz x16, #0, 148f\n" - "st1 { v8.h }[2], [x13]\n" - "st1 { v12.h }[2], [x9]\n" - "st1 { v16.h }[2], [x27]\n" - "b 148f\n" - "147:" // Height 3: Partial direct writeback: partial_1_0 - "str h8, [x13, #0x0]\n" - "str h12, [x9, #0x0]\n" - "str h16, [x27, #0x0]\n" - "148:" // Height 3: Partial direct writeback: Done - "b 150f\n" - "149:" // Height 3: Full writeback - "str q8, [x13, #0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "str q12, [x9, #0x0]\n" - "str q13, [x9, #0x10]\n" - "str q14, [x9, #0x20]\n" - "str q15, [x9, #0x30]\n" - "str q16, [x27, #0x0]\n" - "str q17, [x27, #0x10]\n" - "str q18, [x27, #0x20]\n" - "str q19, [x27, #0x30]\n" - "add x13, x13, #0x40\n" - "add x9, x9, #0x40\n" - "add x27, x27, #0x40\n" - "150:" // Height 3: Writeback done - "subs x16, x16, #0x20\n" - "bgt 103b\n" - "b 302f\n" - "151:" // Height 4 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 
152f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #1\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #1\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "add x27, x27, x19, LSL #1\n" - "add x25, x25, x19, LSL #1\n" - "b 153f\n" - "152:" // Height 4: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #1\n" - "add x27, x9, x19, LSL #1\n" - "add x25, x27, x19, LSL #1\n" - "153:" // Height 4: Column loop - "cbz x14, 154f\n" - "ldr q8, [x14, #0x0]\n" + "129:" // Height 3: No activation + "cmp x11, #0x20\n" + "bge 146f\n" + "tbz x11, #4, 137f\n" + "st1 { v8.8h }, [x28], #0x10\n" + "st1 { v9.8h }, [x28], #0x10\n" + "st1 { v12.8h }, [x24], #0x10\n" + "st1 { v13.8h }, [x24], #0x10\n" + "st1 { v16.8h }, [x23], #0x10\n" + "st1 { v17.8h }, [x23], #0x10\n" + "tbz x11, #3, 133f\n" + "st1 { v10.8h }, [x28], #0x10\n" + "st1 { v14.8h }, [x24], #0x10\n" + "st1 { v18.8h }, [x23], #0x10\n" + "tbz x11, #2, 131f\n" + "str d11, [x28], #0x8\n" + "str d15, [x24], #0x8\n" + "str d19, [x23], #0x8\n" + "tbz x11, #1, 130f\n" + "st1 { v11.s }[2], [x28], #0x4\n" + "st1 { v15.s }[2], [x24], #0x4\n" + "st1 { v19.s }[2], [x23], #0x4\n" + "tbz x11, #0, 145f\n" + "st1 { v11.h }[6], [x28]\n" + "st1 { v15.h }[6], [x24]\n" + "st1 { v19.h }[6], [x23]\n" + "b 145f\n" + "130:" // Height 3: Partial direct writeback: partial_1_28 + "tbz x11, #0, 145f\n" + "st1 { v11.h }[4], [x28]\n" + "st1 { v15.h }[4], [x24]\n" + "st1 { v19.h }[4], [x23]\n" + "b 145f\n" + "131:" // Height 3: Partial direct writeback: partial_2_24 + "tbz x11, #1, 132f\n" + "str s11, [x28], #0x4\n" + "str s15, [x24], #0x4\n" + "str s19, [x23], #0x4\n" + "tbz x11, #0, 145f\n" + "st1 { v11.h }[2], [x28]\n" + "st1 { v15.h }[2], [x24]\n" + "st1 { v19.h }[2], [x23]\n" + "b 145f\n" + "132:" // Height 3: Partial direct writeback: partial_1_24 + "tbz x11, #0, 145f\n" + "str h11, [x28, #0x0]\n" + "str h15, [x24, #0x0]\n" + "str h19, [x23, #0x0]\n" + "b 
145f\n" + "133:" // Height 3: Partial direct writeback: partial_4_16 + "tbz x11, #2, 135f\n" + "str d10, [x28], #0x8\n" + "str d14, [x24], #0x8\n" + "str d18, [x23], #0x8\n" + "tbz x11, #1, 134f\n" + "st1 { v10.s }[2], [x28], #0x4\n" + "st1 { v14.s }[2], [x24], #0x4\n" + "st1 { v18.s }[2], [x23], #0x4\n" + "tbz x11, #0, 145f\n" + "st1 { v10.h }[6], [x28]\n" + "st1 { v14.h }[6], [x24]\n" + "st1 { v18.h }[6], [x23]\n" + "b 145f\n" + "134:" // Height 3: Partial direct writeback: partial_1_20 + "tbz x11, #0, 145f\n" + "st1 { v10.h }[4], [x28]\n" + "st1 { v14.h }[4], [x24]\n" + "st1 { v18.h }[4], [x23]\n" + "b 145f\n" + "135:" // Height 3: Partial direct writeback: partial_2_16 + "tbz x11, #1, 136f\n" + "str s10, [x28], #0x4\n" + "str s14, [x24], #0x4\n" + "str s18, [x23], #0x4\n" + "tbz x11, #0, 145f\n" + "st1 { v10.h }[2], [x28]\n" + "st1 { v14.h }[2], [x24]\n" + "st1 { v18.h }[2], [x23]\n" + "b 145f\n" + "136:" // Height 3: Partial direct writeback: partial_1_16 + "tbz x11, #0, 145f\n" + "str h10, [x28, #0x0]\n" + "str h14, [x24, #0x0]\n" + "str h18, [x23, #0x0]\n" + "b 145f\n" + "137:" // Height 3: Partial direct writeback: partial_8_0 + "tbz x11, #3, 141f\n" + "st1 { v8.8h }, [x28], #0x10\n" + "st1 { v12.8h }, [x24], #0x10\n" + "st1 { v16.8h }, [x23], #0x10\n" + "tbz x11, #2, 139f\n" + "str d9, [x28], #0x8\n" + "str d13, [x24], #0x8\n" + "str d17, [x23], #0x8\n" + "tbz x11, #1, 138f\n" + "st1 { v9.s }[2], [x28], #0x4\n" + "st1 { v13.s }[2], [x24], #0x4\n" + "st1 { v17.s }[2], [x23], #0x4\n" + "tbz x11, #0, 145f\n" + "st1 { v9.h }[6], [x28]\n" + "st1 { v13.h }[6], [x24]\n" + "st1 { v17.h }[6], [x23]\n" + "b 145f\n" + "138:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x11, #0, 145f\n" + "st1 { v9.h }[4], [x28]\n" + "st1 { v13.h }[4], [x24]\n" + "st1 { v17.h }[4], [x23]\n" + "b 145f\n" + "139:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x11, #1, 140f\n" + "str s9, [x28], #0x4\n" + "str s13, [x24], #0x4\n" + "str s17, [x23], #0x4\n" + "tbz 
x11, #0, 145f\n" + "st1 { v9.h }[2], [x28]\n" + "st1 { v13.h }[2], [x24]\n" + "st1 { v17.h }[2], [x23]\n" + "b 145f\n" + "140:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x11, #0, 145f\n" + "str h9, [x28, #0x0]\n" + "str h13, [x24, #0x0]\n" + "str h17, [x23, #0x0]\n" + "b 145f\n" + "141:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x11, #2, 143f\n" + "str d8, [x28], #0x8\n" + "str d12, [x24], #0x8\n" + "str d16, [x23], #0x8\n" + "tbz x11, #1, 142f\n" + "st1 { v8.s }[2], [x28], #0x4\n" + "st1 { v12.s }[2], [x24], #0x4\n" + "st1 { v16.s }[2], [x23], #0x4\n" + "tbz x11, #0, 145f\n" + "st1 { v8.h }[6], [x28]\n" + "st1 { v12.h }[6], [x24]\n" + "st1 { v16.h }[6], [x23]\n" + "b 145f\n" + "142:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x11, #0, 145f\n" + "st1 { v8.h }[4], [x28]\n" + "st1 { v12.h }[4], [x24]\n" + "st1 { v16.h }[4], [x23]\n" + "b 145f\n" + "143:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x11, #1, 144f\n" + "str s8, [x28], #0x4\n" + "str s12, [x24], #0x4\n" + "str s16, [x23], #0x4\n" + "tbz x11, #0, 145f\n" + "st1 { v8.h }[2], [x28]\n" + "st1 { v12.h }[2], [x24]\n" + "st1 { v16.h }[2], [x23]\n" + "b 145f\n" + "144:" // Height 3: Partial direct writeback: partial_1_0 + "str h8, [x28, #0x0]\n" + "str h12, [x24, #0x0]\n" + "str h16, [x23, #0x0]\n" + "145:" // Height 3: Partial direct writeback: Done + "b 147f\n" + "146:" // Height 3: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q12, [x24, #0x0]\n" + "str q13, [x24, #0x10]\n" + "str q14, [x24, #0x20]\n" + "str q15, [x24, #0x30]\n" + "str q16, [x23, #0x0]\n" + "str q17, [x23, #0x10]\n" + "str q18, [x23, #0x20]\n" + "str q19, [x23, #0x30]\n" + "147:" // Height 3: Writeback done + "subs x11, x11, #0x20\n" + "bgt 100b\n" + "b 296f\n" + "148:" // Height 4 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, 
[%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "149:" // Height 4: Column loop + "cbz x9, 150f\n" + "ldr q8, [x9, #0x0]\n" "mov v12.16b, v8.16b\n" - "ldr q9, [x14, #0x10]\n" + "ldr q9, [x9, #0x10]\n" "mov v16.16b, v8.16b\n" - "ldr q10, [x14, #0x20]\n" + "ldr q10, [x9, #0x20]\n" "mov v20.16b, v8.16b\n" - "ldr q11, [x14, #0x30]\n" - "add x14, x14, #0x40\n" + "ldr q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" "mov v13.16b, v9.16b\n" "mov v17.16b, v9.16b\n" "mov v14.16b, v10.16b\n" @@ -2004,240 +1967,241 @@ void a64_hybrid_fp16_mla_6x32 ( "mov v21.16b, v9.16b\n" "mov v22.16b, v10.16b\n" "mov v23.16b, v11.16b\n" - "b 173f\n" - "154:" // Height 4: no bias - "tbz %x[flags], #0, 172f\n" - "cmp x16, #0x20\n" - "bge 171f\n" - "tbz x16, #4, 162f\n" - "ld1 { v8.8h }, [x13], #0x10\n" - "ld1 { v12.8h }, [x9], #0x10\n" - "ld1 { v16.8h }, [x27], #0x10\n" - "ld1 { v20.8h }, [x25], #0x10\n" - "ld1 { v9.8h }, [x13], #0x10\n" - "ld1 { v13.8h }, [x9], #0x10\n" - "ld1 { v17.8h }, [x27], #0x10\n" - "ld1 { v21.8h }, [x25], #0x10\n" - "tbz x16, #3, 158f\n" - "ld1 { v10.8h }, [x13], #0x10\n" - "ld1 { v14.8h }, [x9], #0x10\n" - "ld1 { v18.8h }, [x27], #0x10\n" - "ld1 { v22.8h }, [x25], #0x10\n" - "tbz x16, #2, 156f\n" - "ldr d11, [x13], #0x8\n" - "ldr d15, [x9], #0x8\n" - "ldr d19, [x27], #0x8\n" - "ldr d23, [x25], #0x8\n" - "tbz x16, #1, 155f\n" + "b 169f\n" + "150:" // Height 4: no bias + "tbz %x[flags], #0, 168f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x20\n" + "add x24, x28, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "bge 167f\n" + "tbz x11, #4, 158f\n" + "ld1 { v8.8h }, [x28], #0x10\n" + "ld1 { v12.8h }, [x24], #0x10\n" + "ld1 { v16.8h }, [x23], #0x10\n" + "ld1 { v20.8h }, [x22], #0x10\n" + "ld1 { v9.8h }, [x28], #0x10\n" + "ld1 { v13.8h }, [x24], #0x10\n" + "ld1 { v17.8h }, [x23], #0x10\n" + "ld1 { v21.8h }, [x22], #0x10\n" + "tbz x11, #3, 154f\n" + "ld1 { v10.8h }, [x28], #0x10\n" + "ld1 { 
v14.8h }, [x24], #0x10\n" + "ld1 { v18.8h }, [x23], #0x10\n" + "ld1 { v22.8h }, [x22], #0x10\n" + "tbz x11, #2, 152f\n" + "ldr d11, [x28], #0x8\n" + "ldr d15, [x24], #0x8\n" + "ldr d19, [x23], #0x8\n" + "ldr d23, [x22], #0x8\n" + "tbz x11, #1, 151f\n" "mov x19, #0x3c\n" - "ld1 { v11.s }[2], [x13], #0x4\n" - "ld1 { v15.s }[2], [x9], #0x4\n" - "ld1 { v19.s }[2], [x27], #0x4\n" - "ld1 { v23.s }[2], [x25], #0x4\n" - "tbz x16, #0, 170f\n" - "ld1 { v11.h }[6], [x13]\n" - "ld1 { v15.h }[6], [x9]\n" - "ld1 { v19.h }[6], [x27]\n" - "ld1 { v23.h }[6], [x25]\n" - "b 170f\n" - "155:" // Height 4: Partial accumulate: partial_1_28 + "ld1 { v11.s }[2], [x28], #0x4\n" + "ld1 { v15.s }[2], [x24], #0x4\n" + "ld1 { v19.s }[2], [x23], #0x4\n" + "ld1 { v23.s }[2], [x22], #0x4\n" + "tbz x11, #0, 166f\n" + "ld1 { v11.h }[6], [x28]\n" + "ld1 { v15.h }[6], [x24]\n" + "ld1 { v19.h }[6], [x23]\n" + "ld1 { v23.h }[6], [x22]\n" + "b 166f\n" + "151:" // Height 4: Partial accumulate: partial_1_28 "mov x19, #0x38\n" - "tbz x16, #0, 170f\n" - "ld1 { v11.h }[4], [x13]\n" - "ld1 { v15.h }[4], [x9]\n" - "ld1 { v19.h }[4], [x27]\n" - "ld1 { v23.h }[4], [x25]\n" - "b 170f\n" - "156:" // Height 4: Partial accumulate: partial_2_24 - "tbz x16, #1, 157f\n" - "ldr s11, [x13], #0x4\n" - "ldr s15, [x9], #0x4\n" - "ldr s19, [x27], #0x4\n" - "ldr s23, [x25], #0x4\n" + "tbz x11, #0, 166f\n" + "ld1 { v11.h }[4], [x28]\n" + "ld1 { v15.h }[4], [x24]\n" + "ld1 { v19.h }[4], [x23]\n" + "ld1 { v23.h }[4], [x22]\n" + "b 166f\n" + "152:" // Height 4: Partial accumulate: partial_2_24 + "tbz x11, #1, 153f\n" + "ldr s11, [x28], #0x4\n" + "ldr s15, [x24], #0x4\n" "mov x19, #0x34\n" - "tbz x16, #0, 170f\n" - "ld1 { v11.h }[2], [x13]\n" - "ld1 { v15.h }[2], [x9]\n" - "ld1 { v19.h }[2], [x27]\n" - "ld1 { v23.h }[2], [x25]\n" - "b 170f\n" - "157:" // Height 4: Partial accumulate: partial_1_24 + "ldr s19, [x23], #0x4\n" + "ldr s23, [x22], #0x4\n" + "tbz x11, #0, 166f\n" + "ld1 { v11.h }[2], [x28]\n" + "ld1 { v15.h }[2], [x24]\n" 
+ "ld1 { v19.h }[2], [x23]\n" + "ld1 { v23.h }[2], [x22]\n" + "b 166f\n" + "153:" // Height 4: Partial accumulate: partial_1_24 "mov x19, #0x30\n" - "tbz x16, #0, 170f\n" - "ldr h11, [x13, #0x0]\n" - "ldr h15, [x9, #0x0]\n" - "ldr h19, [x27, #0x0]\n" - "ldr h23, [x25, #0x0]\n" - "b 170f\n" - "158:" // Height 4: Partial accumulate: partial_4_16 - "tbz x16, #2, 160f\n" - "ldr d10, [x13], #0x8\n" - "ldr d14, [x9], #0x8\n" - "ldr d18, [x27], #0x8\n" - "ldr d22, [x25], #0x8\n" - "tbz x16, #1, 159f\n" - "ld1 { v10.s }[2], [x13], #0x4\n" - "ld1 { v14.s }[2], [x9], #0x4\n" - "ld1 { v18.s }[2], [x27], #0x4\n" - "ld1 { v22.s }[2], [x25], #0x4\n" + "tbz x11, #0, 166f\n" + "ldr h11, [x28, #0x0]\n" + "ldr h15, [x24, #0x0]\n" + "ldr h19, [x23, #0x0]\n" + "ldr h23, [x22, #0x0]\n" + "b 166f\n" + "154:" // Height 4: Partial accumulate: partial_4_16 + "tbz x11, #2, 156f\n" + "ldr d10, [x28], #0x8\n" + "ldr d14, [x24], #0x8\n" + "ldr d18, [x23], #0x8\n" + "ldr d22, [x22], #0x8\n" + "tbz x11, #1, 155f\n" "mov x19, #0x2c\n" - "tbz x16, #0, 170f\n" - "ld1 { v10.h }[6], [x13]\n" - "ld1 { v14.h }[6], [x9]\n" - "ld1 { v18.h }[6], [x27]\n" - "ld1 { v22.h }[6], [x25]\n" - "b 170f\n" - "159:" // Height 4: Partial accumulate: partial_1_20 + "ld1 { v10.s }[2], [x28], #0x4\n" + "ld1 { v14.s }[2], [x24], #0x4\n" + "ld1 { v18.s }[2], [x23], #0x4\n" + "ld1 { v22.s }[2], [x22], #0x4\n" + "tbz x11, #0, 166f\n" + "ld1 { v10.h }[6], [x28]\n" + "ld1 { v14.h }[6], [x24]\n" + "ld1 { v18.h }[6], [x23]\n" + "ld1 { v22.h }[6], [x22]\n" + "b 166f\n" + "155:" // Height 4: Partial accumulate: partial_1_20 "mov x19, #0x28\n" - "tbz x16, #0, 170f\n" - "ld1 { v10.h }[4], [x13]\n" - "ld1 { v14.h }[4], [x9]\n" - "ld1 { v18.h }[4], [x27]\n" - "ld1 { v22.h }[4], [x25]\n" - "b 170f\n" - "160:" // Height 4: Partial accumulate: partial_2_16 - "tbz x16, #1, 161f\n" - "ldr s10, [x13], #0x4\n" - "ldr s14, [x9], #0x4\n" - "ldr s18, [x27], #0x4\n" - "ldr s22, [x25], #0x4\n" + "tbz x11, #0, 166f\n" + "ld1 { v10.h }[4], 
[x28]\n" + "ld1 { v14.h }[4], [x24]\n" + "ld1 { v18.h }[4], [x23]\n" + "ld1 { v22.h }[4], [x22]\n" + "b 166f\n" + "156:" // Height 4: Partial accumulate: partial_2_16 + "tbz x11, #1, 157f\n" + "ldr s10, [x28], #0x4\n" + "ldr s14, [x24], #0x4\n" "mov x19, #0x24\n" - "tbz x16, #0, 170f\n" - "ld1 { v10.h }[2], [x13]\n" - "ld1 { v14.h }[2], [x9]\n" - "ld1 { v18.h }[2], [x27]\n" - "ld1 { v22.h }[2], [x25]\n" - "b 170f\n" - "161:" // Height 4: Partial accumulate: partial_1_16 + "ldr s18, [x23], #0x4\n" + "ldr s22, [x22], #0x4\n" + "tbz x11, #0, 166f\n" + "ld1 { v10.h }[2], [x28]\n" + "ld1 { v14.h }[2], [x24]\n" + "ld1 { v18.h }[2], [x23]\n" + "ld1 { v22.h }[2], [x22]\n" + "b 166f\n" + "157:" // Height 4: Partial accumulate: partial_1_16 "mov x19, #0x20\n" - "tbz x16, #0, 170f\n" - "ldr h10, [x13, #0x0]\n" - "ldr h14, [x9, #0x0]\n" - "ldr h18, [x27, #0x0]\n" - "ldr h22, [x25, #0x0]\n" - "b 170f\n" - "162:" // Height 4: Partial accumulate: partial_8_0 - "tbz x16, #3, 166f\n" - "ld1 { v8.8h }, [x13], #0x10\n" - "ld1 { v12.8h }, [x9], #0x10\n" - "ld1 { v16.8h }, [x27], #0x10\n" - "ld1 { v20.8h }, [x25], #0x10\n" - "tbz x16, #2, 164f\n" - "ldr d9, [x13], #0x8\n" - "ldr d13, [x9], #0x8\n" - "ldr d17, [x27], #0x8\n" - "ldr d21, [x25], #0x8\n" - "tbz x16, #1, 163f\n" + "tbz x11, #0, 166f\n" + "ldr h10, [x28, #0x0]\n" + "ldr h14, [x24, #0x0]\n" + "ldr h18, [x23, #0x0]\n" + "ldr h22, [x22, #0x0]\n" + "b 166f\n" + "158:" // Height 4: Partial accumulate: partial_8_0 + "tbz x11, #3, 162f\n" + "ld1 { v8.8h }, [x28], #0x10\n" + "ld1 { v12.8h }, [x24], #0x10\n" + "ld1 { v16.8h }, [x23], #0x10\n" + "ld1 { v20.8h }, [x22], #0x10\n" + "tbz x11, #2, 160f\n" + "ldr d9, [x28], #0x8\n" + "ldr d13, [x24], #0x8\n" + "ldr d17, [x23], #0x8\n" + "ldr d21, [x22], #0x8\n" + "tbz x11, #1, 159f\n" "mov x19, #0x1c\n" - "ld1 { v9.s }[2], [x13], #0x4\n" - "ld1 { v13.s }[2], [x9], #0x4\n" - "ld1 { v17.s }[2], [x27], #0x4\n" - "ld1 { v21.s }[2], [x25], #0x4\n" - "tbz x16, #0, 170f\n" - "ld1 { v9.h }[6], 
[x13]\n" - "ld1 { v13.h }[6], [x9]\n" - "ld1 { v17.h }[6], [x27]\n" - "ld1 { v21.h }[6], [x25]\n" - "b 170f\n" - "163:" // Height 4: Partial accumulate: partial_1_12 + "ld1 { v9.s }[2], [x28], #0x4\n" + "ld1 { v13.s }[2], [x24], #0x4\n" + "ld1 { v17.s }[2], [x23], #0x4\n" + "ld1 { v21.s }[2], [x22], #0x4\n" + "tbz x11, #0, 166f\n" + "ld1 { v9.h }[6], [x28]\n" + "ld1 { v13.h }[6], [x24]\n" + "ld1 { v17.h }[6], [x23]\n" + "ld1 { v21.h }[6], [x22]\n" + "b 166f\n" + "159:" // Height 4: Partial accumulate: partial_1_12 "mov x19, #0x18\n" - "tbz x16, #0, 170f\n" - "ld1 { v9.h }[4], [x13]\n" - "ld1 { v13.h }[4], [x9]\n" - "ld1 { v17.h }[4], [x27]\n" - "ld1 { v21.h }[4], [x25]\n" - "b 170f\n" - "164:" // Height 4: Partial accumulate: partial_2_8 - "tbz x16, #1, 165f\n" - "ldr s9, [x13], #0x4\n" - "ldr s13, [x9], #0x4\n" - "ldr s17, [x27], #0x4\n" - "ldr s21, [x25], #0x4\n" + "tbz x11, #0, 166f\n" + "ld1 { v9.h }[4], [x28]\n" + "ld1 { v13.h }[4], [x24]\n" + "ld1 { v17.h }[4], [x23]\n" + "ld1 { v21.h }[4], [x22]\n" + "b 166f\n" + "160:" // Height 4: Partial accumulate: partial_2_8 + "tbz x11, #1, 161f\n" + "ldr s9, [x28], #0x4\n" + "ldr s13, [x24], #0x4\n" "mov x19, #0x14\n" - "tbz x16, #0, 170f\n" - "ld1 { v9.h }[2], [x13]\n" - "ld1 { v13.h }[2], [x9]\n" - "ld1 { v17.h }[2], [x27]\n" - "ld1 { v21.h }[2], [x25]\n" - "b 170f\n" - "165:" // Height 4: Partial accumulate: partial_1_8 + "ldr s17, [x23], #0x4\n" + "ldr s21, [x22], #0x4\n" + "tbz x11, #0, 166f\n" + "ld1 { v9.h }[2], [x28]\n" + "ld1 { v13.h }[2], [x24]\n" + "ld1 { v17.h }[2], [x23]\n" + "ld1 { v21.h }[2], [x22]\n" + "b 166f\n" + "161:" // Height 4: Partial accumulate: partial_1_8 "mov x19, #0x10\n" - "tbz x16, #0, 170f\n" - "ldr h9, [x13, #0x0]\n" - "ldr h13, [x9, #0x0]\n" - "ldr h17, [x27, #0x0]\n" - "ldr h21, [x25, #0x0]\n" - "b 170f\n" - "166:" // Height 4: Partial accumulate: partial_4_0 - "tbz x16, #2, 168f\n" - "ldr d8, [x13], #0x8\n" - "ldr d12, [x9], #0x8\n" - "ldr d16, [x27], #0x8\n" - "ldr d20, [x25], 
#0x8\n" - "tbz x16, #1, 167f\n" - "ld1 { v8.s }[2], [x13], #0x4\n" - "ld1 { v12.s }[2], [x9], #0x4\n" - "ld1 { v16.s }[2], [x27], #0x4\n" - "ld1 { v20.s }[2], [x25], #0x4\n" + "tbz x11, #0, 166f\n" + "ldr h9, [x28, #0x0]\n" + "ldr h13, [x24, #0x0]\n" + "ldr h17, [x23, #0x0]\n" + "ldr h21, [x22, #0x0]\n" + "b 166f\n" + "162:" // Height 4: Partial accumulate: partial_4_0 + "tbz x11, #2, 164f\n" + "ldr d8, [x28], #0x8\n" + "ldr d12, [x24], #0x8\n" + "ldr d16, [x23], #0x8\n" + "ldr d20, [x22], #0x8\n" + "tbz x11, #1, 163f\n" "mov x19, #0xc\n" - "tbz x16, #0, 170f\n" - "ld1 { v8.h }[6], [x13]\n" - "ld1 { v12.h }[6], [x9]\n" - "ld1 { v16.h }[6], [x27]\n" - "ld1 { v20.h }[6], [x25]\n" - "b 170f\n" - "167:" // Height 4: Partial accumulate: partial_1_4 + "ld1 { v8.s }[2], [x28], #0x4\n" + "ld1 { v12.s }[2], [x24], #0x4\n" + "ld1 { v16.s }[2], [x23], #0x4\n" + "ld1 { v20.s }[2], [x22], #0x4\n" + "tbz x11, #0, 166f\n" + "ld1 { v8.h }[6], [x28]\n" + "ld1 { v12.h }[6], [x24]\n" + "ld1 { v16.h }[6], [x23]\n" + "ld1 { v20.h }[6], [x22]\n" + "b 166f\n" + "163:" // Height 4: Partial accumulate: partial_1_4 "mov x19, #0x8\n" - "tbz x16, #0, 170f\n" - "ld1 { v8.h }[4], [x13]\n" - "ld1 { v12.h }[4], [x9]\n" - "ld1 { v16.h }[4], [x27]\n" - "ld1 { v20.h }[4], [x25]\n" - "b 170f\n" - "168:" // Height 4: Partial accumulate: partial_2_0 - "tbz x16, #1, 169f\n" - "ldr s8, [x13], #0x4\n" - "ldr s12, [x9], #0x4\n" - "ldr s16, [x27], #0x4\n" - "ldr s20, [x25], #0x4\n" + "tbz x11, #0, 166f\n" + "ld1 { v8.h }[4], [x28]\n" + "ld1 { v12.h }[4], [x24]\n" + "ld1 { v16.h }[4], [x23]\n" + "ld1 { v20.h }[4], [x22]\n" + "b 166f\n" + "164:" // Height 4: Partial accumulate: partial_2_0 + "tbz x11, #1, 165f\n" + "ldr s8, [x28], #0x4\n" + "ldr s12, [x24], #0x4\n" "mov x19, #0x4\n" - "tbz x16, #0, 170f\n" - "ld1 { v8.h }[2], [x13]\n" - "ld1 { v12.h }[2], [x9]\n" - "ld1 { v16.h }[2], [x27]\n" - "ld1 { v20.h }[2], [x25]\n" - "b 170f\n" - "169:" // Height 4: Partial accumulate: partial_1_0 + "ldr s16, [x23], 
#0x4\n" + "ldr s20, [x22], #0x4\n" + "tbz x11, #0, 166f\n" + "ld1 { v8.h }[2], [x28]\n" + "ld1 { v12.h }[2], [x24]\n" + "ld1 { v16.h }[2], [x23]\n" + "ld1 { v20.h }[2], [x22]\n" + "b 166f\n" + "165:" // Height 4: Partial accumulate: partial_1_0 + "ldr h8, [x28, #0x0]\n" "mov x19, #0x0\n" - "ldr h8, [x13, #0x0]\n" - "ldr h12, [x9, #0x0]\n" - "ldr h16, [x27, #0x0]\n" - "ldr h20, [x25, #0x0]\n" - "170:" // Height 4: Partial accumulate: Done - "sub x13, x13, x19\n" - "sub x9, x9, x19\n" - "sub x27, x27, x19\n" - "sub x25, x25, x19\n" - "b 173f\n" - "171:" // Height 4: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "ldr q12, [x9, #0x0]\n" - "ldr q13, [x9, #0x10]\n" - "ldr q14, [x9, #0x20]\n" - "ldr q15, [x9, #0x30]\n" - "ldr q16, [x27, #0x0]\n" - "ldr q17, [x27, #0x10]\n" - "ldr q18, [x27, #0x20]\n" - "ldr q19, [x27, #0x30]\n" - "ldr q20, [x25, #0x0]\n" - "ldr q21, [x25, #0x10]\n" - "ldr q22, [x25, #0x20]\n" - "ldr q23, [x25, #0x30]\n" - "b 173f\n" - "172:" // Height 4: no accumulate + "ldr h12, [x24, #0x0]\n" + "ldr h16, [x23, #0x0]\n" + "ldr h20, [x22, #0x0]\n" + "166:" // Height 4: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 169f\n" + "167:" // Height 4: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "ldr q12, [x24, #0x0]\n" + "ldr q13, [x24, #0x10]\n" + "ldr q14, [x24, #0x20]\n" + "ldr q15, [x24, #0x30]\n" + "ldr q16, [x23, #0x0]\n" + "ldr q17, [x23, #0x10]\n" + "ldr q18, [x23, #0x20]\n" + "ldr q19, [x23, #0x30]\n" + "ldr q20, [x22, #0x0]\n" + "ldr q21, [x22, #0x10]\n" + "ldr q22, [x22, #0x20]\n" + "ldr q23, [x22, #0x30]\n" + "b 169f\n" + "168:" // Height 4: no accumulate "movi v8.16b, #0x0\n" "movi v9.16b, #0x0\n" "movi v10.16b, #0x0\n" @@ -2254,380 +2218,380 @@ void a64_hybrid_fp16_mla_6x32 ( "movi v21.16b, #0x0\n" "movi v22.16b, #0x0\n" "movi v23.16b, #0x0\n" - "173:" // Height 4: setup done - 
"mov x12, #0x0\n" - "174:" // Height 4: String loop + "169:" // Height 4: setup done + "mov x27, #0x0\n" + "170:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 175f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 171f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "cbnz x12, 176f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "cbnz x27, 172f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #1\n" - "add x28, x28, x19, LSL #1\n" - "add x26, x26, x19, LSL #1\n" + "add x25, x25, x19, LSL #1\n" "add x24, x24, x19, LSL #1\n" - "b 176f\n" - "175:" // Height 4: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #1\n" - "add x26, x28, x19, LSL #1\n" - "add x24, x26, x19, LSL #1\n" - "176:" // Height 4: input setup done - "cmp x11, #0x8\n" - "blt 179f\n" - "cmp x11, #0x10\n" - "blt 178f\n" - "177:" // Height 4: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "add x23, x23, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "b 172f\n" + "171:" // Height 4: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "172:" // Height 4: input setup done + "cmp x26, #0x8\n" + "blt 175f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x10\n" + "ldr q2, [x23, #0x0]\n" + "ldr q3, [x22, #0x0]\n" + "ldr q6, [x10, #0x0]\n" + "blt 174f\n" + "173:" // Height 4: Multiply loop: Main loop head "fmla 
v8.8h, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" + "add x25, x25, #0x10\n" "fmla v12.8h, v6.8h, v1.h[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" "fmla v16.8h, v6.8h, v2.h[0]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x23, x23, #0x10\n" "fmla v20.8h, v6.8h, v3.h[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q6, [x10, #0x20]\n" + "add x22, x22, #0x10\n" "fmla v9.8h, v7.8h, v0.h[0]\n" - "add x26, x26, #0x10\n" - "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "sub x26, x26, #0x8\n" "fmla v13.8h, v7.8h, v1.h[0]\n" - "add x24, x24, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "cmp x26, #0x10\n" "fmla v17.8h, v7.8h, v2.h[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "sub x11, x11, #0x8\n" "fmla v21.8h, v7.8h, v3.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" "fmla v10.8h, v6.8h, v0.h[0]\n" - "cmp x11, #0x10\n" "fmla v14.8h, v6.8h, v1.h[0]\n" "fmla v18.8h, v6.8h, v2.h[0]\n" "fmla v22.8h, v6.8h, v3.h[0]\n" - "ldr q6, [x15, #0x40]\n" + "ldr q6, [x10, #0x40]\n" "fmla v11.8h, v7.8h, v0.h[0]\n" "fmla v15.8h, v7.8h, v1.h[0]\n" "fmla v19.8h, v7.8h, v2.h[0]\n" "fmla v23.8h, v7.8h, v3.h[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" "fmla v8.8h, v6.8h, v0.h[1]\n" "fmla v12.8h, v6.8h, v1.h[1]\n" "fmla v16.8h, v6.8h, v2.h[1]\n" "fmla v20.8h, v6.8h, v3.h[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" "fmla v9.8h, v7.8h, v0.h[1]\n" "fmla v13.8h, v7.8h, v1.h[1]\n" "fmla v17.8h, v7.8h, v2.h[1]\n" "fmla v21.8h, v7.8h, v3.h[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" "fmla v10.8h, v6.8h, v0.h[1]\n" "fmla v14.8h, v6.8h, v1.h[1]\n" "fmla v18.8h, v6.8h, v2.h[1]\n" "fmla v22.8h, v6.8h, v3.h[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" "fmla v11.8h, v7.8h, v0.h[1]\n" "fmla v15.8h, v7.8h, v1.h[1]\n" "fmla v19.8h, v7.8h, v2.h[1]\n" "fmla 
v23.8h, v7.8h, v3.h[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" "fmla v8.8h, v6.8h, v0.h[2]\n" "fmla v12.8h, v6.8h, v1.h[2]\n" "fmla v16.8h, v6.8h, v2.h[2]\n" "fmla v20.8h, v6.8h, v3.h[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" "fmla v9.8h, v7.8h, v0.h[2]\n" "fmla v13.8h, v7.8h, v1.h[2]\n" "fmla v17.8h, v7.8h, v2.h[2]\n" "fmla v21.8h, v7.8h, v3.h[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" "fmla v10.8h, v6.8h, v0.h[2]\n" "fmla v14.8h, v6.8h, v1.h[2]\n" "fmla v18.8h, v6.8h, v2.h[2]\n" "fmla v22.8h, v6.8h, v3.h[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" "fmla v11.8h, v7.8h, v0.h[2]\n" "fmla v15.8h, v7.8h, v1.h[2]\n" "fmla v19.8h, v7.8h, v2.h[2]\n" "fmla v23.8h, v7.8h, v3.h[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" "fmla v8.8h, v6.8h, v0.h[3]\n" "fmla v12.8h, v6.8h, v1.h[3]\n" "fmla v16.8h, v6.8h, v2.h[3]\n" "fmla v20.8h, v6.8h, v3.h[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" "fmla v9.8h, v7.8h, v0.h[3]\n" "fmla v13.8h, v7.8h, v1.h[3]\n" "fmla v17.8h, v7.8h, v2.h[3]\n" "fmla v21.8h, v7.8h, v3.h[3]\n" - "ldr q7, [x15, #0xf0]\n" + "ldr q7, [x10, #0xf0]\n" "fmla v10.8h, v6.8h, v0.h[3]\n" "fmla v14.8h, v6.8h, v1.h[3]\n" "fmla v18.8h, v6.8h, v2.h[3]\n" "fmla v22.8h, v6.8h, v3.h[3]\n" - "ldr q6, [x15, #0x100]\n" + "ldr q6, [x10, #0x100]\n" "fmla v11.8h, v7.8h, v0.h[3]\n" "fmla v15.8h, v7.8h, v1.h[3]\n" "fmla v19.8h, v7.8h, v2.h[3]\n" "fmla v23.8h, v7.8h, v3.h[3]\n" - "ldr q7, [x15, #0x110]\n" + "ldr q7, [x10, #0x110]\n" "fmla v8.8h, v6.8h, v0.h[4]\n" "fmla v12.8h, v6.8h, v1.h[4]\n" "fmla v16.8h, v6.8h, v2.h[4]\n" "fmla v20.8h, v6.8h, v3.h[4]\n" - "ldr q6, [x15, #0x120]\n" + "ldr q6, [x10, #0x120]\n" "fmla v9.8h, v7.8h, v0.h[4]\n" "fmla v13.8h, v7.8h, v1.h[4]\n" "fmla v17.8h, v7.8h, v2.h[4]\n" "fmla v21.8h, v7.8h, v3.h[4]\n" - "ldr q7, [x15, #0x130]\n" + "ldr q7, [x10, #0x130]\n" "fmla v10.8h, v6.8h, v0.h[4]\n" "fmla v14.8h, v6.8h, v1.h[4]\n" "fmla v18.8h, v6.8h, v2.h[4]\n" 
"fmla v22.8h, v6.8h, v3.h[4]\n" - "ldr q6, [x15, #0x140]\n" + "ldr q6, [x10, #0x140]\n" "fmla v11.8h, v7.8h, v0.h[4]\n" "fmla v15.8h, v7.8h, v1.h[4]\n" "fmla v19.8h, v7.8h, v2.h[4]\n" "fmla v23.8h, v7.8h, v3.h[4]\n" - "ldr q7, [x15, #0x150]\n" + "ldr q7, [x10, #0x150]\n" "fmla v8.8h, v6.8h, v0.h[5]\n" "fmla v12.8h, v6.8h, v1.h[5]\n" "fmla v16.8h, v6.8h, v2.h[5]\n" "fmla v20.8h, v6.8h, v3.h[5]\n" - "ldr q6, [x15, #0x160]\n" + "ldr q6, [x10, #0x160]\n" "fmla v9.8h, v7.8h, v0.h[5]\n" "fmla v13.8h, v7.8h, v1.h[5]\n" "fmla v17.8h, v7.8h, v2.h[5]\n" "fmla v21.8h, v7.8h, v3.h[5]\n" - "ldr q7, [x15, #0x170]\n" + "ldr q7, [x10, #0x170]\n" "fmla v10.8h, v6.8h, v0.h[5]\n" "fmla v14.8h, v6.8h, v1.h[5]\n" "fmla v18.8h, v6.8h, v2.h[5]\n" "fmla v22.8h, v6.8h, v3.h[5]\n" - "ldr q6, [x15, #0x180]\n" + "ldr q6, [x10, #0x180]\n" "fmla v11.8h, v7.8h, v0.h[5]\n" "fmla v15.8h, v7.8h, v1.h[5]\n" "fmla v19.8h, v7.8h, v2.h[5]\n" "fmla v23.8h, v7.8h, v3.h[5]\n" - "ldr q7, [x15, #0x190]\n" + "ldr q7, [x10, #0x190]\n" "fmla v8.8h, v6.8h, v0.h[6]\n" "fmla v12.8h, v6.8h, v1.h[6]\n" "fmla v16.8h, v6.8h, v2.h[6]\n" "fmla v20.8h, v6.8h, v3.h[6]\n" - "ldr q6, [x15, #0x1a0]\n" + "ldr q6, [x10, #0x1a0]\n" "fmla v9.8h, v7.8h, v0.h[6]\n" "fmla v13.8h, v7.8h, v1.h[6]\n" "fmla v17.8h, v7.8h, v2.h[6]\n" "fmla v21.8h, v7.8h, v3.h[6]\n" - "ldr q7, [x15, #0x1b0]\n" + "ldr q7, [x10, #0x1b0]\n" "fmla v10.8h, v6.8h, v0.h[6]\n" "fmla v14.8h, v6.8h, v1.h[6]\n" "fmla v18.8h, v6.8h, v2.h[6]\n" "fmla v22.8h, v6.8h, v3.h[6]\n" - "ldr q6, [x15, #0x1c0]\n" + "ldr q6, [x10, #0x1c0]\n" "fmla v11.8h, v7.8h, v0.h[6]\n" "fmla v15.8h, v7.8h, v1.h[6]\n" "fmla v19.8h, v7.8h, v2.h[6]\n" "fmla v23.8h, v7.8h, v3.h[6]\n" - "ldr q7, [x15, #0x1d0]\n" + "ldr q7, [x10, #0x1d0]\n" "fmla v8.8h, v6.8h, v0.h[7]\n" "fmla v12.8h, v6.8h, v1.h[7]\n" "fmla v16.8h, v6.8h, v2.h[7]\n" "fmla v20.8h, v6.8h, v3.h[7]\n" - "ldr q6, [x15, #0x1e0]\n" + "ldr q6, [x10, #0x1e0]\n" "fmla v9.8h, v7.8h, v0.h[7]\n" "fmla v13.8h, v7.8h, v1.h[7]\n" "fmla v17.8h, 
v7.8h, v2.h[7]\n" "fmla v21.8h, v7.8h, v3.h[7]\n" - "ldr q7, [x15, #0x1f0]\n" - "add x15, x15, #0x200\n" + "ldr q7, [x10, #0x1f0]\n" + "add x10, x10, #0x200\n" "fmla v10.8h, v6.8h, v0.h[7]\n" "fmla v14.8h, v6.8h, v1.h[7]\n" "fmla v18.8h, v6.8h, v2.h[7]\n" "fmla v22.8h, v6.8h, v3.h[7]\n" + "ldr q6, [x10, #0x0]\n" "fmla v11.8h, v7.8h, v0.h[7]\n" + "ldr q0, [x25, #0x0]\n" "fmla v15.8h, v7.8h, v1.h[7]\n" + "ldr q1, [x24, #0x0]\n" "fmla v19.8h, v7.8h, v2.h[7]\n" + "ldr q2, [x23, #0x0]\n" "fmla v23.8h, v7.8h, v3.h[7]\n" - "bge 177b\n" - "178:" // Height 4: Multiply loop: Single iteration only - "sub x11, x11, #0x8\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "ldr q3, [x22, #0x0]\n" + "bge 173b\n" + "174:" // Height 4: Multiply loop: Single iteration only "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" + "sub x26, x26, #0x8\n" "fmla v12.8h, v6.8h, v1.h[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "add x25, x25, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" "fmla v16.8h, v6.8h, v2.h[0]\n" - "add x28, x28, #0x10\n" + "add x24, x24, #0x10\n" "fmla v20.8h, v6.8h, v3.h[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "ldr q6, [x15, #0x20]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x23, x23, #0x10\n" "fmla v9.8h, v7.8h, v0.h[0]\n" - "add x26, x26, #0x10\n" - "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x10, #0x20]\n" + "add x22, x22, #0x10\n" "fmla v13.8h, v7.8h, v1.h[0]\n" - "add x24, x24, #0x10\n" + "prfm pldl1keep, [x23, #0x80]\n" "fmla v17.8h, v7.8h, v2.h[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" "fmla v21.8h, v7.8h, v3.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" "fmla v10.8h, v6.8h, v0.h[0]\n" "fmla v14.8h, v6.8h, v1.h[0]\n" "fmla v18.8h, v6.8h, v2.h[0]\n" "fmla v22.8h, v6.8h, v3.h[0]\n" - "ldr q6, [x15, #0x40]\n" + "ldr q6, [x10, #0x40]\n" "fmla v11.8h, v7.8h, v0.h[0]\n" "fmla 
v15.8h, v7.8h, v1.h[0]\n" "fmla v19.8h, v7.8h, v2.h[0]\n" "fmla v23.8h, v7.8h, v3.h[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" "fmla v8.8h, v6.8h, v0.h[1]\n" "fmla v12.8h, v6.8h, v1.h[1]\n" "fmla v16.8h, v6.8h, v2.h[1]\n" "fmla v20.8h, v6.8h, v3.h[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" "fmla v9.8h, v7.8h, v0.h[1]\n" "fmla v13.8h, v7.8h, v1.h[1]\n" "fmla v17.8h, v7.8h, v2.h[1]\n" "fmla v21.8h, v7.8h, v3.h[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" "fmla v10.8h, v6.8h, v0.h[1]\n" "fmla v14.8h, v6.8h, v1.h[1]\n" "fmla v18.8h, v6.8h, v2.h[1]\n" "fmla v22.8h, v6.8h, v3.h[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" "fmla v11.8h, v7.8h, v0.h[1]\n" "fmla v15.8h, v7.8h, v1.h[1]\n" "fmla v19.8h, v7.8h, v2.h[1]\n" "fmla v23.8h, v7.8h, v3.h[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" "fmla v8.8h, v6.8h, v0.h[2]\n" "fmla v12.8h, v6.8h, v1.h[2]\n" "fmla v16.8h, v6.8h, v2.h[2]\n" "fmla v20.8h, v6.8h, v3.h[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" "fmla v9.8h, v7.8h, v0.h[2]\n" "fmla v13.8h, v7.8h, v1.h[2]\n" "fmla v17.8h, v7.8h, v2.h[2]\n" "fmla v21.8h, v7.8h, v3.h[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" "fmla v10.8h, v6.8h, v0.h[2]\n" "fmla v14.8h, v6.8h, v1.h[2]\n" "fmla v18.8h, v6.8h, v2.h[2]\n" "fmla v22.8h, v6.8h, v3.h[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" "fmla v11.8h, v7.8h, v0.h[2]\n" "fmla v15.8h, v7.8h, v1.h[2]\n" "fmla v19.8h, v7.8h, v2.h[2]\n" "fmla v23.8h, v7.8h, v3.h[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" "fmla v8.8h, v6.8h, v0.h[3]\n" "fmla v12.8h, v6.8h, v1.h[3]\n" "fmla v16.8h, v6.8h, v2.h[3]\n" "fmla v20.8h, v6.8h, v3.h[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" "fmla v9.8h, v7.8h, v0.h[3]\n" "fmla v13.8h, v7.8h, v1.h[3]\n" "fmla v17.8h, v7.8h, v2.h[3]\n" "fmla v21.8h, v7.8h, v3.h[3]\n" - "ldr q7, [x15, #0xf0]\n" + "ldr q7, [x10, #0xf0]\n" "fmla v10.8h, v6.8h, v0.h[3]\n" "fmla 
v14.8h, v6.8h, v1.h[3]\n" "fmla v18.8h, v6.8h, v2.h[3]\n" "fmla v22.8h, v6.8h, v3.h[3]\n" - "ldr q6, [x15, #0x100]\n" + "ldr q6, [x10, #0x100]\n" "fmla v11.8h, v7.8h, v0.h[3]\n" "fmla v15.8h, v7.8h, v1.h[3]\n" "fmla v19.8h, v7.8h, v2.h[3]\n" "fmla v23.8h, v7.8h, v3.h[3]\n" - "ldr q7, [x15, #0x110]\n" + "ldr q7, [x10, #0x110]\n" "fmla v8.8h, v6.8h, v0.h[4]\n" "fmla v12.8h, v6.8h, v1.h[4]\n" "fmla v16.8h, v6.8h, v2.h[4]\n" "fmla v20.8h, v6.8h, v3.h[4]\n" - "ldr q6, [x15, #0x120]\n" + "ldr q6, [x10, #0x120]\n" "fmla v9.8h, v7.8h, v0.h[4]\n" "fmla v13.8h, v7.8h, v1.h[4]\n" "fmla v17.8h, v7.8h, v2.h[4]\n" "fmla v21.8h, v7.8h, v3.h[4]\n" - "ldr q7, [x15, #0x130]\n" + "ldr q7, [x10, #0x130]\n" "fmla v10.8h, v6.8h, v0.h[4]\n" "fmla v14.8h, v6.8h, v1.h[4]\n" "fmla v18.8h, v6.8h, v2.h[4]\n" "fmla v22.8h, v6.8h, v3.h[4]\n" - "ldr q6, [x15, #0x140]\n" + "ldr q6, [x10, #0x140]\n" "fmla v11.8h, v7.8h, v0.h[4]\n" "fmla v15.8h, v7.8h, v1.h[4]\n" "fmla v19.8h, v7.8h, v2.h[4]\n" "fmla v23.8h, v7.8h, v3.h[4]\n" - "ldr q7, [x15, #0x150]\n" + "ldr q7, [x10, #0x150]\n" "fmla v8.8h, v6.8h, v0.h[5]\n" "fmla v12.8h, v6.8h, v1.h[5]\n" "fmla v16.8h, v6.8h, v2.h[5]\n" "fmla v20.8h, v6.8h, v3.h[5]\n" - "ldr q6, [x15, #0x160]\n" + "ldr q6, [x10, #0x160]\n" "fmla v9.8h, v7.8h, v0.h[5]\n" "fmla v13.8h, v7.8h, v1.h[5]\n" "fmla v17.8h, v7.8h, v2.h[5]\n" "fmla v21.8h, v7.8h, v3.h[5]\n" - "ldr q7, [x15, #0x170]\n" + "ldr q7, [x10, #0x170]\n" "fmla v10.8h, v6.8h, v0.h[5]\n" "fmla v14.8h, v6.8h, v1.h[5]\n" "fmla v18.8h, v6.8h, v2.h[5]\n" "fmla v22.8h, v6.8h, v3.h[5]\n" - "ldr q6, [x15, #0x180]\n" + "ldr q6, [x10, #0x180]\n" "fmla v11.8h, v7.8h, v0.h[5]\n" "fmla v15.8h, v7.8h, v1.h[5]\n" "fmla v19.8h, v7.8h, v2.h[5]\n" "fmla v23.8h, v7.8h, v3.h[5]\n" - "ldr q7, [x15, #0x190]\n" + "ldr q7, [x10, #0x190]\n" "fmla v8.8h, v6.8h, v0.h[6]\n" "fmla v12.8h, v6.8h, v1.h[6]\n" "fmla v16.8h, v6.8h, v2.h[6]\n" "fmla v20.8h, v6.8h, v3.h[6]\n" - "ldr q6, [x15, #0x1a0]\n" + "ldr q6, [x10, #0x1a0]\n" "fmla v9.8h, 
v7.8h, v0.h[6]\n" "fmla v13.8h, v7.8h, v1.h[6]\n" "fmla v17.8h, v7.8h, v2.h[6]\n" "fmla v21.8h, v7.8h, v3.h[6]\n" - "ldr q7, [x15, #0x1b0]\n" + "ldr q7, [x10, #0x1b0]\n" "fmla v10.8h, v6.8h, v0.h[6]\n" "fmla v14.8h, v6.8h, v1.h[6]\n" "fmla v18.8h, v6.8h, v2.h[6]\n" "fmla v22.8h, v6.8h, v3.h[6]\n" - "ldr q6, [x15, #0x1c0]\n" + "ldr q6, [x10, #0x1c0]\n" "fmla v11.8h, v7.8h, v0.h[6]\n" "fmla v15.8h, v7.8h, v1.h[6]\n" "fmla v19.8h, v7.8h, v2.h[6]\n" "fmla v23.8h, v7.8h, v3.h[6]\n" - "ldr q7, [x15, #0x1d0]\n" + "ldr q7, [x10, #0x1d0]\n" "fmla v8.8h, v6.8h, v0.h[7]\n" "fmla v12.8h, v6.8h, v1.h[7]\n" "fmla v16.8h, v6.8h, v2.h[7]\n" "fmla v20.8h, v6.8h, v3.h[7]\n" - "ldr q6, [x15, #0x1e0]\n" + "ldr q6, [x10, #0x1e0]\n" "fmla v9.8h, v7.8h, v0.h[7]\n" "fmla v13.8h, v7.8h, v1.h[7]\n" "fmla v17.8h, v7.8h, v2.h[7]\n" "fmla v21.8h, v7.8h, v3.h[7]\n" - "ldr q7, [x15, #0x1f0]\n" - "add x15, x15, #0x200\n" + "ldr q7, [x10, #0x1f0]\n" + "add x10, x10, #0x200\n" "fmla v10.8h, v6.8h, v0.h[7]\n" "fmla v14.8h, v6.8h, v1.h[7]\n" "fmla v18.8h, v6.8h, v2.h[7]\n" @@ -2636,28 +2600,28 @@ void a64_hybrid_fp16_mla_6x32 ( "fmla v15.8h, v7.8h, v1.h[7]\n" "fmla v19.8h, v7.8h, v2.h[7]\n" "fmla v23.8h, v7.8h, v3.h[7]\n" - "179:" // Height 4: Multiply loop: Main loop skip - "cbz x11, 181f\n" - "180:" // Height 4: Multiply loop: Odd block loop - "ldr h0, [x10], #0x2\n" - "ldr h1, [x28], #0x2\n" - "ldr h2, [x26], #0x2\n" - "ldr h3, [x24], #0x2\n" - "ldr q6, [x15, #0x0]\n" + "175:" // Height 4: Multiply loop: Main loop skip + "cbz x26, 177f\n" + "176:" // Height 4: Multiply loop: Odd block loop + "ldr h0, [x25], #0x2\n" + "sub x26, x26, #0x1\n" + "ldr h1, [x24], #0x2\n" + "ldr h2, [x23], #0x2\n" + "ldr h3, [x22], #0x2\n" + "ldr q6, [x10, #0x0]\n" "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" "fmla v12.8h, v6.8h, v1.h[0]\n" - "sub x11, x11, #0x1\n" "fmla v16.8h, v6.8h, v2.h[0]\n" "fmla v20.8h, v6.8h, v3.h[0]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q6, [x10, #0x20]\n" 
"fmla v9.8h, v7.8h, v0.h[0]\n" "fmla v13.8h, v7.8h, v1.h[0]\n" "fmla v17.8h, v7.8h, v2.h[0]\n" "fmla v21.8h, v7.8h, v3.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" "fmla v10.8h, v6.8h, v0.h[0]\n" - "add x15, x15, #0x40\n" "fmla v14.8h, v6.8h, v1.h[0]\n" "fmla v18.8h, v6.8h, v2.h[0]\n" "fmla v22.8h, v6.8h, v3.h[0]\n" @@ -2665,17 +2629,21 @@ void a64_hybrid_fp16_mla_6x32 ( "fmla v15.8h, v7.8h, v1.h[0]\n" "fmla v19.8h, v7.8h, v2.h[0]\n" "fmla v23.8h, v7.8h, v3.h[0]\n" - "cbnz x11, 180b\n" - "181:" // Height 4: Multiply loop: No odd multiplies + "cbnz x26, 176b\n" + "177:" // Height 4: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 174b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "prfm pstl1keep, [x27, #0x0]\n" - "prfm pstl1keep, [x25, #0x0]\n" - "tbz %x[flags], #1, 182f\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 170b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x24, x28, x19, LSL #1\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #1\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #1\n" + "prfm pstl1keep, [x22, #0x0]\n" + "tbz %x[flags], #1, 178f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1r { v1.8h }, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -2712,258 +2680,237 @@ void a64_hybrid_fp16_mla_6x32 ( "fmax v21.8h, v21.8h, v1.8h\n" "fmax v22.8h, v22.8h, v1.8h\n" "fmax v23.8h, v23.8h, v1.8h\n" - "182:" // Height 4: No activation - "cmp x16, #0x20\n" - "bge 199f\n" - "tbz x16, #4, 190f\n" - "st1 { v8.8h }, [x13], #0x10\n" - "st1 { v9.8h }, [x13], #0x10\n" - "st1 { v12.8h }, [x9], #0x10\n" - "st1 { v13.8h }, [x9], #0x10\n" - "st1 { v16.8h }, [x27], #0x10\n" - "st1 { v17.8h }, [x27], #0x10\n" - "st1 { v20.8h }, [x25], #0x10\n" - "st1 { v21.8h }, [x25], #0x10\n" - "tbz x16, #3, 186f\n" - "st1 { v10.8h }, 
[x13], #0x10\n" - "st1 { v14.8h }, [x9], #0x10\n" - "st1 { v18.8h }, [x27], #0x10\n" - "st1 { v22.8h }, [x25], #0x10\n" - "tbz x16, #2, 184f\n" - "str d11, [x13], #0x8\n" - "str d15, [x9], #0x8\n" - "str d19, [x27], #0x8\n" - "str d23, [x25], #0x8\n" - "tbz x16, #1, 183f\n" - "st1 { v11.s }[2], [x13], #0x4\n" - "st1 { v15.s }[2], [x9], #0x4\n" - "st1 { v19.s }[2], [x27], #0x4\n" - "st1 { v23.s }[2], [x25], #0x4\n" - "tbz x16, #0, 198f\n" - "st1 { v11.h }[6], [x13]\n" - "st1 { v15.h }[6], [x9]\n" - "st1 { v19.h }[6], [x27]\n" - "st1 { v23.h }[6], [x25]\n" - "b 198f\n" - "183:" // Height 4: Partial direct writeback: partial_1_28 - "tbz x16, #0, 198f\n" - "st1 { v11.h }[4], [x13]\n" - "st1 { v15.h }[4], [x9]\n" - "st1 { v19.h }[4], [x27]\n" - "st1 { v23.h }[4], [x25]\n" - "b 198f\n" - "184:" // Height 4: Partial direct writeback: partial_2_24 - "tbz x16, #1, 185f\n" - "str s11, [x13], #0x4\n" - "str s15, [x9], #0x4\n" - "str s19, [x27], #0x4\n" - "str s23, [x25], #0x4\n" - "tbz x16, #0, 198f\n" - "st1 { v11.h }[2], [x13]\n" - "st1 { v15.h }[2], [x9]\n" - "st1 { v19.h }[2], [x27]\n" - "st1 { v23.h }[2], [x25]\n" - "b 198f\n" - "185:" // Height 4: Partial direct writeback: partial_1_24 - "tbz x16, #0, 198f\n" - "str h11, [x13, #0x0]\n" - "str h15, [x9, #0x0]\n" - "str h19, [x27, #0x0]\n" - "str h23, [x25, #0x0]\n" - "b 198f\n" - "186:" // Height 4: Partial direct writeback: partial_4_16 - "tbz x16, #2, 188f\n" - "str d10, [x13], #0x8\n" - "str d14, [x9], #0x8\n" - "str d18, [x27], #0x8\n" - "str d22, [x25], #0x8\n" - "tbz x16, #1, 187f\n" - "st1 { v10.s }[2], [x13], #0x4\n" - "st1 { v14.s }[2], [x9], #0x4\n" - "st1 { v18.s }[2], [x27], #0x4\n" - "st1 { v22.s }[2], [x25], #0x4\n" - "tbz x16, #0, 198f\n" - "st1 { v10.h }[6], [x13]\n" - "st1 { v14.h }[6], [x9]\n" - "st1 { v18.h }[6], [x27]\n" - "st1 { v22.h }[6], [x25]\n" - "b 198f\n" - "187:" // Height 4: Partial direct writeback: partial_1_20 - "tbz x16, #0, 198f\n" - "st1 { v10.h }[4], [x13]\n" - "st1 { v14.h }[4], 
[x9]\n" - "st1 { v18.h }[4], [x27]\n" - "st1 { v22.h }[4], [x25]\n" - "b 198f\n" - "188:" // Height 4: Partial direct writeback: partial_2_16 - "tbz x16, #1, 189f\n" - "str s10, [x13], #0x4\n" - "str s14, [x9], #0x4\n" - "str s18, [x27], #0x4\n" - "str s22, [x25], #0x4\n" - "tbz x16, #0, 198f\n" - "st1 { v10.h }[2], [x13]\n" - "st1 { v14.h }[2], [x9]\n" - "st1 { v18.h }[2], [x27]\n" - "st1 { v22.h }[2], [x25]\n" - "b 198f\n" - "189:" // Height 4: Partial direct writeback: partial_1_16 - "tbz x16, #0, 198f\n" - "str h10, [x13, #0x0]\n" - "str h14, [x9, #0x0]\n" - "str h18, [x27, #0x0]\n" - "str h22, [x25, #0x0]\n" - "b 198f\n" - "190:" // Height 4: Partial direct writeback: partial_8_0 - "tbz x16, #3, 194f\n" - "st1 { v8.8h }, [x13], #0x10\n" - "st1 { v12.8h }, [x9], #0x10\n" - "st1 { v16.8h }, [x27], #0x10\n" - "st1 { v20.8h }, [x25], #0x10\n" - "tbz x16, #2, 192f\n" - "str d9, [x13], #0x8\n" - "str d13, [x9], #0x8\n" - "str d17, [x27], #0x8\n" - "str d21, [x25], #0x8\n" - "tbz x16, #1, 191f\n" - "st1 { v9.s }[2], [x13], #0x4\n" - "st1 { v13.s }[2], [x9], #0x4\n" - "st1 { v17.s }[2], [x27], #0x4\n" - "st1 { v21.s }[2], [x25], #0x4\n" - "tbz x16, #0, 198f\n" - "st1 { v9.h }[6], [x13]\n" - "st1 { v13.h }[6], [x9]\n" - "st1 { v17.h }[6], [x27]\n" - "st1 { v21.h }[6], [x25]\n" - "b 198f\n" - "191:" // Height 4: Partial direct writeback: partial_1_12 - "tbz x16, #0, 198f\n" - "st1 { v9.h }[4], [x13]\n" - "st1 { v13.h }[4], [x9]\n" - "st1 { v17.h }[4], [x27]\n" - "st1 { v21.h }[4], [x25]\n" - "b 198f\n" - "192:" // Height 4: Partial direct writeback: partial_2_8 - "tbz x16, #1, 193f\n" - "str s9, [x13], #0x4\n" - "str s13, [x9], #0x4\n" - "str s17, [x27], #0x4\n" - "str s21, [x25], #0x4\n" - "tbz x16, #0, 198f\n" - "st1 { v9.h }[2], [x13]\n" - "st1 { v13.h }[2], [x9]\n" - "st1 { v17.h }[2], [x27]\n" - "st1 { v21.h }[2], [x25]\n" - "b 198f\n" - "193:" // Height 4: Partial direct writeback: partial_1_8 - "tbz x16, #0, 198f\n" - "str h9, [x13, #0x0]\n" - "str h13, [x9, 
#0x0]\n" - "str h17, [x27, #0x0]\n" - "str h21, [x25, #0x0]\n" - "b 198f\n" - "194:" // Height 4: Partial direct writeback: partial_4_0 - "tbz x16, #2, 196f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "str d16, [x27], #0x8\n" - "str d20, [x25], #0x8\n" - "tbz x16, #1, 195f\n" - "st1 { v8.s }[2], [x13], #0x4\n" - "st1 { v12.s }[2], [x9], #0x4\n" - "st1 { v16.s }[2], [x27], #0x4\n" - "st1 { v20.s }[2], [x25], #0x4\n" - "tbz x16, #0, 198f\n" - "st1 { v8.h }[6], [x13]\n" - "st1 { v12.h }[6], [x9]\n" - "st1 { v16.h }[6], [x27]\n" - "st1 { v20.h }[6], [x25]\n" - "b 198f\n" - "195:" // Height 4: Partial direct writeback: partial_1_4 - "tbz x16, #0, 198f\n" - "st1 { v8.h }[4], [x13]\n" - "st1 { v12.h }[4], [x9]\n" - "st1 { v16.h }[4], [x27]\n" - "st1 { v20.h }[4], [x25]\n" - "b 198f\n" - "196:" // Height 4: Partial direct writeback: partial_2_0 - "tbz x16, #1, 197f\n" - "str s8, [x13], #0x4\n" - "str s12, [x9], #0x4\n" - "str s16, [x27], #0x4\n" - "str s20, [x25], #0x4\n" - "tbz x16, #0, 198f\n" - "st1 { v8.h }[2], [x13]\n" - "st1 { v12.h }[2], [x9]\n" - "st1 { v16.h }[2], [x27]\n" - "st1 { v20.h }[2], [x25]\n" - "b 198f\n" - "197:" // Height 4: Partial direct writeback: partial_1_0 - "str h8, [x13, #0x0]\n" - "str h12, [x9, #0x0]\n" - "str h16, [x27, #0x0]\n" - "str h20, [x25, #0x0]\n" - "198:" // Height 4: Partial direct writeback: Done - "b 200f\n" - "199:" // Height 4: Full writeback - "str q8, [x13, #0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "str q12, [x9, #0x0]\n" - "str q13, [x9, #0x10]\n" - "str q14, [x9, #0x20]\n" - "str q15, [x9, #0x30]\n" - "str q16, [x27, #0x0]\n" - "str q17, [x27, #0x10]\n" - "str q18, [x27, #0x20]\n" - "str q19, [x27, #0x30]\n" - "str q20, [x25, #0x0]\n" - "str q21, [x25, #0x10]\n" - "str q22, [x25, #0x20]\n" - "str q23, [x25, #0x30]\n" - "add x13, x13, #0x40\n" - "add x9, x9, #0x40\n" - "add x27, x27, #0x40\n" - "add x25, x25, #0x40\n" - "200:" // Height 4: Writeback done - 
"subs x16, x16, #0x20\n" - "bgt 153b\n" - "b 302f\n" - "201:" // Height 5 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 202f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #1\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #1\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "ldr x23, [%x[output_ptr], #0x20]\n" - "add x27, x27, x19, LSL #1\n" - "add x25, x25, x19, LSL #1\n" - "add x23, x23, x19, LSL #1\n" - "b 203f\n" - "202:" // Height 5: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #1\n" - "add x27, x9, x19, LSL #1\n" - "add x25, x27, x19, LSL #1\n" - "add x23, x25, x19, LSL #1\n" - "203:" // Height 5: Column loop - "cbz x14, 204f\n" - "ldr q8, [x14, #0x0]\n" + "178:" // Height 4: No activation + "cmp x11, #0x20\n" + "bge 195f\n" + "tbz x11, #4, 186f\n" + "st1 { v8.8h }, [x28], #0x10\n" + "st1 { v9.8h }, [x28], #0x10\n" + "st1 { v12.8h }, [x24], #0x10\n" + "st1 { v13.8h }, [x24], #0x10\n" + "st1 { v16.8h }, [x23], #0x10\n" + "st1 { v17.8h }, [x23], #0x10\n" + "st1 { v20.8h }, [x22], #0x10\n" + "st1 { v21.8h }, [x22], #0x10\n" + "tbz x11, #3, 182f\n" + "st1 { v10.8h }, [x28], #0x10\n" + "st1 { v14.8h }, [x24], #0x10\n" + "st1 { v18.8h }, [x23], #0x10\n" + "st1 { v22.8h }, [x22], #0x10\n" + "tbz x11, #2, 180f\n" + "str d11, [x28], #0x8\n" + "str d15, [x24], #0x8\n" + "str d19, [x23], #0x8\n" + "str d23, [x22], #0x8\n" + "tbz x11, #1, 179f\n" + "st1 { v11.s }[2], [x28], #0x4\n" + "st1 { v15.s }[2], [x24], #0x4\n" + "st1 { v19.s }[2], [x23], #0x4\n" + "st1 { v23.s }[2], [x22], #0x4\n" + "tbz x11, #0, 194f\n" + "st1 { v11.h }[6], [x28]\n" + "st1 { v15.h }[6], [x24]\n" + "st1 { v19.h }[6], [x23]\n" + "st1 { v23.h }[6], [x22]\n" + "b 194f\n" + "179:" // Height 4: Partial direct writeback: partial_1_28 + "tbz x11, #0, 194f\n" 
+ "st1 { v11.h }[4], [x28]\n" + "st1 { v15.h }[4], [x24]\n" + "st1 { v19.h }[4], [x23]\n" + "st1 { v23.h }[4], [x22]\n" + "b 194f\n" + "180:" // Height 4: Partial direct writeback: partial_2_24 + "tbz x11, #1, 181f\n" + "str s11, [x28], #0x4\n" + "str s15, [x24], #0x4\n" + "str s19, [x23], #0x4\n" + "str s23, [x22], #0x4\n" + "tbz x11, #0, 194f\n" + "st1 { v11.h }[2], [x28]\n" + "st1 { v15.h }[2], [x24]\n" + "st1 { v19.h }[2], [x23]\n" + "st1 { v23.h }[2], [x22]\n" + "b 194f\n" + "181:" // Height 4: Partial direct writeback: partial_1_24 + "tbz x11, #0, 194f\n" + "str h11, [x28, #0x0]\n" + "str h15, [x24, #0x0]\n" + "str h19, [x23, #0x0]\n" + "str h23, [x22, #0x0]\n" + "b 194f\n" + "182:" // Height 4: Partial direct writeback: partial_4_16 + "tbz x11, #2, 184f\n" + "str d10, [x28], #0x8\n" + "str d14, [x24], #0x8\n" + "str d18, [x23], #0x8\n" + "str d22, [x22], #0x8\n" + "tbz x11, #1, 183f\n" + "st1 { v10.s }[2], [x28], #0x4\n" + "st1 { v14.s }[2], [x24], #0x4\n" + "st1 { v18.s }[2], [x23], #0x4\n" + "st1 { v22.s }[2], [x22], #0x4\n" + "tbz x11, #0, 194f\n" + "st1 { v10.h }[6], [x28]\n" + "st1 { v14.h }[6], [x24]\n" + "st1 { v18.h }[6], [x23]\n" + "st1 { v22.h }[6], [x22]\n" + "b 194f\n" + "183:" // Height 4: Partial direct writeback: partial_1_20 + "tbz x11, #0, 194f\n" + "st1 { v10.h }[4], [x28]\n" + "st1 { v14.h }[4], [x24]\n" + "st1 { v18.h }[4], [x23]\n" + "st1 { v22.h }[4], [x22]\n" + "b 194f\n" + "184:" // Height 4: Partial direct writeback: partial_2_16 + "tbz x11, #1, 185f\n" + "str s10, [x28], #0x4\n" + "str s14, [x24], #0x4\n" + "str s18, [x23], #0x4\n" + "str s22, [x22], #0x4\n" + "tbz x11, #0, 194f\n" + "st1 { v10.h }[2], [x28]\n" + "st1 { v14.h }[2], [x24]\n" + "st1 { v18.h }[2], [x23]\n" + "st1 { v22.h }[2], [x22]\n" + "b 194f\n" + "185:" // Height 4: Partial direct writeback: partial_1_16 + "tbz x11, #0, 194f\n" + "str h10, [x28, #0x0]\n" + "str h14, [x24, #0x0]\n" + "str h18, [x23, #0x0]\n" + "str h22, [x22, #0x0]\n" + "b 194f\n" + "186:" // Height 
4: Partial direct writeback: partial_8_0 + "tbz x11, #3, 190f\n" + "st1 { v8.8h }, [x28], #0x10\n" + "st1 { v12.8h }, [x24], #0x10\n" + "st1 { v16.8h }, [x23], #0x10\n" + "st1 { v20.8h }, [x22], #0x10\n" + "tbz x11, #2, 188f\n" + "str d9, [x28], #0x8\n" + "str d13, [x24], #0x8\n" + "str d17, [x23], #0x8\n" + "str d21, [x22], #0x8\n" + "tbz x11, #1, 187f\n" + "st1 { v9.s }[2], [x28], #0x4\n" + "st1 { v13.s }[2], [x24], #0x4\n" + "st1 { v17.s }[2], [x23], #0x4\n" + "st1 { v21.s }[2], [x22], #0x4\n" + "tbz x11, #0, 194f\n" + "st1 { v9.h }[6], [x28]\n" + "st1 { v13.h }[6], [x24]\n" + "st1 { v17.h }[6], [x23]\n" + "st1 { v21.h }[6], [x22]\n" + "b 194f\n" + "187:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x11, #0, 194f\n" + "st1 { v9.h }[4], [x28]\n" + "st1 { v13.h }[4], [x24]\n" + "st1 { v17.h }[4], [x23]\n" + "st1 { v21.h }[4], [x22]\n" + "b 194f\n" + "188:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x11, #1, 189f\n" + "str s9, [x28], #0x4\n" + "str s13, [x24], #0x4\n" + "str s17, [x23], #0x4\n" + "str s21, [x22], #0x4\n" + "tbz x11, #0, 194f\n" + "st1 { v9.h }[2], [x28]\n" + "st1 { v13.h }[2], [x24]\n" + "st1 { v17.h }[2], [x23]\n" + "st1 { v21.h }[2], [x22]\n" + "b 194f\n" + "189:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x11, #0, 194f\n" + "str h9, [x28, #0x0]\n" + "str h13, [x24, #0x0]\n" + "str h17, [x23, #0x0]\n" + "str h21, [x22, #0x0]\n" + "b 194f\n" + "190:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x11, #2, 192f\n" + "str d8, [x28], #0x8\n" + "str d12, [x24], #0x8\n" + "str d16, [x23], #0x8\n" + "str d20, [x22], #0x8\n" + "tbz x11, #1, 191f\n" + "st1 { v8.s }[2], [x28], #0x4\n" + "st1 { v12.s }[2], [x24], #0x4\n" + "st1 { v16.s }[2], [x23], #0x4\n" + "st1 { v20.s }[2], [x22], #0x4\n" + "tbz x11, #0, 194f\n" + "st1 { v8.h }[6], [x28]\n" + "st1 { v12.h }[6], [x24]\n" + "st1 { v16.h }[6], [x23]\n" + "st1 { v20.h }[6], [x22]\n" + "b 194f\n" + "191:" // Height 4: Partial direct writeback: partial_1_4 
+ "tbz x11, #0, 194f\n" + "st1 { v8.h }[4], [x28]\n" + "st1 { v12.h }[4], [x24]\n" + "st1 { v16.h }[4], [x23]\n" + "st1 { v20.h }[4], [x22]\n" + "b 194f\n" + "192:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x11, #1, 193f\n" + "str s8, [x28], #0x4\n" + "str s12, [x24], #0x4\n" + "str s16, [x23], #0x4\n" + "str s20, [x22], #0x4\n" + "tbz x11, #0, 194f\n" + "st1 { v8.h }[2], [x28]\n" + "st1 { v12.h }[2], [x24]\n" + "st1 { v16.h }[2], [x23]\n" + "st1 { v20.h }[2], [x22]\n" + "b 194f\n" + "193:" // Height 4: Partial direct writeback: partial_1_0 + "str h8, [x28, #0x0]\n" + "str h12, [x24, #0x0]\n" + "str h16, [x23, #0x0]\n" + "str h20, [x22, #0x0]\n" + "194:" // Height 4: Partial direct writeback: Done + "b 196f\n" + "195:" // Height 4: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q12, [x24, #0x0]\n" + "str q13, [x24, #0x10]\n" + "str q14, [x24, #0x20]\n" + "str q15, [x24, #0x30]\n" + "str q16, [x23, #0x0]\n" + "str q17, [x23, #0x10]\n" + "str q18, [x23, #0x20]\n" + "str q19, [x23, #0x30]\n" + "str q20, [x22, #0x0]\n" + "str q21, [x22, #0x10]\n" + "str q22, [x22, #0x20]\n" + "str q23, [x22, #0x30]\n" + "196:" // Height 4: Writeback done + "subs x11, x11, #0x20\n" + "bgt 149b\n" + "b 296f\n" + "197:" // Height 5 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "198:" // Height 5: Column loop + "cbz x9, 199f\n" + "ldr q8, [x9, #0x0]\n" "mov v12.16b, v8.16b\n" - "ldr q9, [x14, #0x10]\n" + "ldr q9, [x9, #0x10]\n" "mov v16.16b, v8.16b\n" - "ldr q10, [x14, #0x20]\n" + "ldr q10, [x9, #0x20]\n" "mov v20.16b, v8.16b\n" - "ldr q11, [x14, #0x30]\n" + "ldr q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" "mov v24.16b, v8.16b\n" - "add x14, x14, #0x40\n" "mov v13.16b, v9.16b\n" "mov v17.16b, v9.16b\n" "mov v14.16b, v10.16b\n" @@ -2976,277 +2923,278 @@ void 
a64_hybrid_fp16_mla_6x32 ( "mov v25.16b, v9.16b\n" "mov v26.16b, v10.16b\n" "mov v27.16b, v11.16b\n" - "b 223f\n" - "204:" // Height 5: no bias - "tbz %x[flags], #0, 222f\n" - "cmp x16, #0x20\n" - "bge 221f\n" - "tbz x16, #4, 212f\n" - "ld1 { v8.8h }, [x13], #0x10\n" - "ld1 { v12.8h }, [x9], #0x10\n" - "ld1 { v16.8h }, [x27], #0x10\n" - "ld1 { v20.8h }, [x25], #0x10\n" - "ld1 { v24.8h }, [x23], #0x10\n" - "ld1 { v9.8h }, [x13], #0x10\n" - "ld1 { v13.8h }, [x9], #0x10\n" - "ld1 { v17.8h }, [x27], #0x10\n" - "ld1 { v21.8h }, [x25], #0x10\n" - "ld1 { v25.8h }, [x23], #0x10\n" - "tbz x16, #3, 208f\n" - "ld1 { v10.8h }, [x13], #0x10\n" - "ld1 { v14.8h }, [x9], #0x10\n" - "ld1 { v18.8h }, [x27], #0x10\n" - "ld1 { v22.8h }, [x25], #0x10\n" - "ld1 { v26.8h }, [x23], #0x10\n" - "tbz x16, #2, 206f\n" - "ldr d11, [x13], #0x8\n" - "ldr d15, [x9], #0x8\n" - "ldr d19, [x27], #0x8\n" - "ldr d23, [x25], #0x8\n" - "ldr d27, [x23], #0x8\n" - "tbz x16, #1, 205f\n" - "ld1 { v11.s }[2], [x13], #0x4\n" - "ld1 { v15.s }[2], [x9], #0x4\n" - "ld1 { v19.s }[2], [x27], #0x4\n" - "ld1 { v23.s }[2], [x25], #0x4\n" - "ld1 { v27.s }[2], [x23], #0x4\n" + "b 218f\n" + "199:" // Height 5: no bias + "tbz %x[flags], #0, 217f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x20\n" + "add x24, x28, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "add x21, x22, x19, LSL #1\n" + "bge 216f\n" + "tbz x11, #4, 207f\n" + "ld1 { v8.8h }, [x28], #0x10\n" + "ld1 { v12.8h }, [x24], #0x10\n" + "ld1 { v16.8h }, [x23], #0x10\n" + "ld1 { v20.8h }, [x22], #0x10\n" + "ld1 { v24.8h }, [x21], #0x10\n" + "ld1 { v9.8h }, [x28], #0x10\n" + "ld1 { v13.8h }, [x24], #0x10\n" + "ld1 { v17.8h }, [x23], #0x10\n" + "ld1 { v21.8h }, [x22], #0x10\n" + "ld1 { v25.8h }, [x21], #0x10\n" + "tbz x11, #3, 203f\n" + "ld1 { v10.8h }, [x28], #0x10\n" + "ld1 { v14.8h }, [x24], #0x10\n" + "ld1 { v18.8h }, [x23], #0x10\n" + "ld1 { v22.8h }, [x22], #0x10\n" + "ld1 { v26.8h }, [x21], 
#0x10\n" + "tbz x11, #2, 201f\n" + "ldr d11, [x28], #0x8\n" + "ldr d15, [x24], #0x8\n" + "ldr d19, [x23], #0x8\n" + "ldr d23, [x22], #0x8\n" + "ldr d27, [x21], #0x8\n" + "tbz x11, #1, 200f\n" + "ld1 { v11.s }[2], [x28], #0x4\n" "mov x19, #0x3c\n" - "tbz x16, #0, 220f\n" - "ld1 { v11.h }[6], [x13]\n" - "ld1 { v15.h }[6], [x9]\n" - "ld1 { v19.h }[6], [x27]\n" - "ld1 { v23.h }[6], [x25]\n" - "ld1 { v27.h }[6], [x23]\n" - "b 220f\n" - "205:" // Height 5: Partial accumulate: partial_1_28 + "ld1 { v15.s }[2], [x24], #0x4\n" + "ld1 { v19.s }[2], [x23], #0x4\n" + "ld1 { v23.s }[2], [x22], #0x4\n" + "ld1 { v27.s }[2], [x21], #0x4\n" + "tbz x11, #0, 215f\n" + "ld1 { v11.h }[6], [x28]\n" + "ld1 { v15.h }[6], [x24]\n" + "ld1 { v19.h }[6], [x23]\n" + "ld1 { v23.h }[6], [x22]\n" + "ld1 { v27.h }[6], [x21]\n" + "b 215f\n" + "200:" // Height 5: Partial accumulate: partial_1_28 "mov x19, #0x38\n" - "tbz x16, #0, 220f\n" - "ld1 { v11.h }[4], [x13]\n" - "ld1 { v15.h }[4], [x9]\n" - "ld1 { v19.h }[4], [x27]\n" - "ld1 { v23.h }[4], [x25]\n" - "ld1 { v27.h }[4], [x23]\n" - "b 220f\n" - "206:" // Height 5: Partial accumulate: partial_2_24 - "tbz x16, #1, 207f\n" - "ldr s11, [x13], #0x4\n" - "ldr s15, [x9], #0x4\n" - "ldr s19, [x27], #0x4\n" - "ldr s23, [x25], #0x4\n" - "ldr s27, [x23], #0x4\n" + "tbz x11, #0, 215f\n" + "ld1 { v11.h }[4], [x28]\n" + "ld1 { v15.h }[4], [x24]\n" + "ld1 { v19.h }[4], [x23]\n" + "ld1 { v23.h }[4], [x22]\n" + "ld1 { v27.h }[4], [x21]\n" + "b 215f\n" + "201:" // Height 5: Partial accumulate: partial_2_24 + "tbz x11, #1, 202f\n" + "ldr s11, [x28], #0x4\n" + "ldr s15, [x24], #0x4\n" "mov x19, #0x34\n" - "tbz x16, #0, 220f\n" - "ld1 { v11.h }[2], [x13]\n" - "ld1 { v15.h }[2], [x9]\n" - "ld1 { v19.h }[2], [x27]\n" - "ld1 { v23.h }[2], [x25]\n" - "ld1 { v27.h }[2], [x23]\n" - "b 220f\n" - "207:" // Height 5: Partial accumulate: partial_1_24 + "ldr s19, [x23], #0x4\n" + "ldr s23, [x22], #0x4\n" + "ldr s27, [x21], #0x4\n" + "tbz x11, #0, 215f\n" + "ld1 { v11.h }[2], 
[x28]\n" + "ld1 { v15.h }[2], [x24]\n" + "ld1 { v19.h }[2], [x23]\n" + "ld1 { v23.h }[2], [x22]\n" + "ld1 { v27.h }[2], [x21]\n" + "b 215f\n" + "202:" // Height 5: Partial accumulate: partial_1_24 "mov x19, #0x30\n" - "tbz x16, #0, 220f\n" - "ldr h11, [x13, #0x0]\n" - "ldr h15, [x9, #0x0]\n" - "ldr h19, [x27, #0x0]\n" - "ldr h23, [x25, #0x0]\n" - "ldr h27, [x23, #0x0]\n" - "b 220f\n" - "208:" // Height 5: Partial accumulate: partial_4_16 - "tbz x16, #2, 210f\n" - "ldr d10, [x13], #0x8\n" - "ldr d14, [x9], #0x8\n" - "ldr d18, [x27], #0x8\n" - "ldr d22, [x25], #0x8\n" - "ldr d26, [x23], #0x8\n" - "tbz x16, #1, 209f\n" - "ld1 { v10.s }[2], [x13], #0x4\n" - "ld1 { v14.s }[2], [x9], #0x4\n" - "ld1 { v18.s }[2], [x27], #0x4\n" - "ld1 { v22.s }[2], [x25], #0x4\n" - "ld1 { v26.s }[2], [x23], #0x4\n" + "tbz x11, #0, 215f\n" + "ldr h11, [x28, #0x0]\n" + "ldr h15, [x24, #0x0]\n" + "ldr h19, [x23, #0x0]\n" + "ldr h23, [x22, #0x0]\n" + "ldr h27, [x21, #0x0]\n" + "b 215f\n" + "203:" // Height 5: Partial accumulate: partial_4_16 + "tbz x11, #2, 205f\n" + "ldr d10, [x28], #0x8\n" + "ldr d14, [x24], #0x8\n" + "ldr d18, [x23], #0x8\n" + "ldr d22, [x22], #0x8\n" + "ldr d26, [x21], #0x8\n" + "tbz x11, #1, 204f\n" + "ld1 { v10.s }[2], [x28], #0x4\n" "mov x19, #0x2c\n" - "tbz x16, #0, 220f\n" - "ld1 { v10.h }[6], [x13]\n" - "ld1 { v14.h }[6], [x9]\n" - "ld1 { v18.h }[6], [x27]\n" - "ld1 { v22.h }[6], [x25]\n" - "ld1 { v26.h }[6], [x23]\n" - "b 220f\n" - "209:" // Height 5: Partial accumulate: partial_1_20 + "ld1 { v14.s }[2], [x24], #0x4\n" + "ld1 { v18.s }[2], [x23], #0x4\n" + "ld1 { v22.s }[2], [x22], #0x4\n" + "ld1 { v26.s }[2], [x21], #0x4\n" + "tbz x11, #0, 215f\n" + "ld1 { v10.h }[6], [x28]\n" + "ld1 { v14.h }[6], [x24]\n" + "ld1 { v18.h }[6], [x23]\n" + "ld1 { v22.h }[6], [x22]\n" + "ld1 { v26.h }[6], [x21]\n" + "b 215f\n" + "204:" // Height 5: Partial accumulate: partial_1_20 "mov x19, #0x28\n" - "tbz x16, #0, 220f\n" - "ld1 { v10.h }[4], [x13]\n" - "ld1 { v14.h }[4], [x9]\n" - 
"ld1 { v18.h }[4], [x27]\n" - "ld1 { v22.h }[4], [x25]\n" - "ld1 { v26.h }[4], [x23]\n" - "b 220f\n" - "210:" // Height 5: Partial accumulate: partial_2_16 - "tbz x16, #1, 211f\n" - "ldr s10, [x13], #0x4\n" - "ldr s14, [x9], #0x4\n" - "ldr s18, [x27], #0x4\n" - "ldr s22, [x25], #0x4\n" - "ldr s26, [x23], #0x4\n" + "tbz x11, #0, 215f\n" + "ld1 { v10.h }[4], [x28]\n" + "ld1 { v14.h }[4], [x24]\n" + "ld1 { v18.h }[4], [x23]\n" + "ld1 { v22.h }[4], [x22]\n" + "ld1 { v26.h }[4], [x21]\n" + "b 215f\n" + "205:" // Height 5: Partial accumulate: partial_2_16 + "tbz x11, #1, 206f\n" + "ldr s10, [x28], #0x4\n" + "ldr s14, [x24], #0x4\n" "mov x19, #0x24\n" - "tbz x16, #0, 220f\n" - "ld1 { v10.h }[2], [x13]\n" - "ld1 { v14.h }[2], [x9]\n" - "ld1 { v18.h }[2], [x27]\n" - "ld1 { v22.h }[2], [x25]\n" - "ld1 { v26.h }[2], [x23]\n" - "b 220f\n" - "211:" // Height 5: Partial accumulate: partial_1_16 + "ldr s18, [x23], #0x4\n" + "ldr s22, [x22], #0x4\n" + "ldr s26, [x21], #0x4\n" + "tbz x11, #0, 215f\n" + "ld1 { v10.h }[2], [x28]\n" + "ld1 { v14.h }[2], [x24]\n" + "ld1 { v18.h }[2], [x23]\n" + "ld1 { v22.h }[2], [x22]\n" + "ld1 { v26.h }[2], [x21]\n" + "b 215f\n" + "206:" // Height 5: Partial accumulate: partial_1_16 "mov x19, #0x20\n" - "tbz x16, #0, 220f\n" - "ldr h10, [x13, #0x0]\n" - "ldr h14, [x9, #0x0]\n" - "ldr h18, [x27, #0x0]\n" - "ldr h22, [x25, #0x0]\n" - "ldr h26, [x23, #0x0]\n" - "b 220f\n" - "212:" // Height 5: Partial accumulate: partial_8_0 - "tbz x16, #3, 216f\n" - "ld1 { v8.8h }, [x13], #0x10\n" - "ld1 { v12.8h }, [x9], #0x10\n" - "ld1 { v16.8h }, [x27], #0x10\n" - "ld1 { v20.8h }, [x25], #0x10\n" - "ld1 { v24.8h }, [x23], #0x10\n" - "tbz x16, #2, 214f\n" - "ldr d9, [x13], #0x8\n" - "ldr d13, [x9], #0x8\n" - "ldr d17, [x27], #0x8\n" - "ldr d21, [x25], #0x8\n" - "ldr d25, [x23], #0x8\n" - "tbz x16, #1, 213f\n" - "ld1 { v9.s }[2], [x13], #0x4\n" - "ld1 { v13.s }[2], [x9], #0x4\n" - "ld1 { v17.s }[2], [x27], #0x4\n" - "ld1 { v21.s }[2], [x25], #0x4\n" - "ld1 { v25.s 
}[2], [x23], #0x4\n" + "tbz x11, #0, 215f\n" + "ldr h10, [x28, #0x0]\n" + "ldr h14, [x24, #0x0]\n" + "ldr h18, [x23, #0x0]\n" + "ldr h22, [x22, #0x0]\n" + "ldr h26, [x21, #0x0]\n" + "b 215f\n" + "207:" // Height 5: Partial accumulate: partial_8_0 + "tbz x11, #3, 211f\n" + "ld1 { v8.8h }, [x28], #0x10\n" + "ld1 { v12.8h }, [x24], #0x10\n" + "ld1 { v16.8h }, [x23], #0x10\n" + "ld1 { v20.8h }, [x22], #0x10\n" + "ld1 { v24.8h }, [x21], #0x10\n" + "tbz x11, #2, 209f\n" + "ldr d9, [x28], #0x8\n" + "ldr d13, [x24], #0x8\n" + "ldr d17, [x23], #0x8\n" + "ldr d21, [x22], #0x8\n" + "ldr d25, [x21], #0x8\n" + "tbz x11, #1, 208f\n" + "ld1 { v9.s }[2], [x28], #0x4\n" "mov x19, #0x1c\n" - "tbz x16, #0, 220f\n" - "ld1 { v9.h }[6], [x13]\n" - "ld1 { v13.h }[6], [x9]\n" - "ld1 { v17.h }[6], [x27]\n" - "ld1 { v21.h }[6], [x25]\n" - "ld1 { v25.h }[6], [x23]\n" - "b 220f\n" - "213:" // Height 5: Partial accumulate: partial_1_12 + "ld1 { v13.s }[2], [x24], #0x4\n" + "ld1 { v17.s }[2], [x23], #0x4\n" + "ld1 { v21.s }[2], [x22], #0x4\n" + "ld1 { v25.s }[2], [x21], #0x4\n" + "tbz x11, #0, 215f\n" + "ld1 { v9.h }[6], [x28]\n" + "ld1 { v13.h }[6], [x24]\n" + "ld1 { v17.h }[6], [x23]\n" + "ld1 { v21.h }[6], [x22]\n" + "ld1 { v25.h }[6], [x21]\n" + "b 215f\n" + "208:" // Height 5: Partial accumulate: partial_1_12 "mov x19, #0x18\n" - "tbz x16, #0, 220f\n" - "ld1 { v9.h }[4], [x13]\n" - "ld1 { v13.h }[4], [x9]\n" - "ld1 { v17.h }[4], [x27]\n" - "ld1 { v21.h }[4], [x25]\n" - "ld1 { v25.h }[4], [x23]\n" - "b 220f\n" - "214:" // Height 5: Partial accumulate: partial_2_8 - "tbz x16, #1, 215f\n" - "ldr s9, [x13], #0x4\n" - "ldr s13, [x9], #0x4\n" - "ldr s17, [x27], #0x4\n" - "ldr s21, [x25], #0x4\n" - "ldr s25, [x23], #0x4\n" + "tbz x11, #0, 215f\n" + "ld1 { v9.h }[4], [x28]\n" + "ld1 { v13.h }[4], [x24]\n" + "ld1 { v17.h }[4], [x23]\n" + "ld1 { v21.h }[4], [x22]\n" + "ld1 { v25.h }[4], [x21]\n" + "b 215f\n" + "209:" // Height 5: Partial accumulate: partial_2_8 + "tbz x11, #1, 210f\n" + "ldr s9, 
[x28], #0x4\n" + "ldr s13, [x24], #0x4\n" "mov x19, #0x14\n" - "tbz x16, #0, 220f\n" - "ld1 { v9.h }[2], [x13]\n" - "ld1 { v13.h }[2], [x9]\n" - "ld1 { v17.h }[2], [x27]\n" - "ld1 { v21.h }[2], [x25]\n" - "ld1 { v25.h }[2], [x23]\n" - "b 220f\n" - "215:" // Height 5: Partial accumulate: partial_1_8 + "ldr s17, [x23], #0x4\n" + "ldr s21, [x22], #0x4\n" + "ldr s25, [x21], #0x4\n" + "tbz x11, #0, 215f\n" + "ld1 { v9.h }[2], [x28]\n" + "ld1 { v13.h }[2], [x24]\n" + "ld1 { v17.h }[2], [x23]\n" + "ld1 { v21.h }[2], [x22]\n" + "ld1 { v25.h }[2], [x21]\n" + "b 215f\n" + "210:" // Height 5: Partial accumulate: partial_1_8 "mov x19, #0x10\n" - "tbz x16, #0, 220f\n" - "ldr h9, [x13, #0x0]\n" - "ldr h13, [x9, #0x0]\n" - "ldr h17, [x27, #0x0]\n" - "ldr h21, [x25, #0x0]\n" - "ldr h25, [x23, #0x0]\n" - "b 220f\n" - "216:" // Height 5: Partial accumulate: partial_4_0 - "tbz x16, #2, 218f\n" - "ldr d8, [x13], #0x8\n" - "ldr d12, [x9], #0x8\n" - "ldr d16, [x27], #0x8\n" - "ldr d20, [x25], #0x8\n" - "ldr d24, [x23], #0x8\n" - "tbz x16, #1, 217f\n" - "ld1 { v8.s }[2], [x13], #0x4\n" - "ld1 { v12.s }[2], [x9], #0x4\n" - "ld1 { v16.s }[2], [x27], #0x4\n" - "ld1 { v20.s }[2], [x25], #0x4\n" - "ld1 { v24.s }[2], [x23], #0x4\n" + "tbz x11, #0, 215f\n" + "ldr h9, [x28, #0x0]\n" + "ldr h13, [x24, #0x0]\n" + "ldr h17, [x23, #0x0]\n" + "ldr h21, [x22, #0x0]\n" + "ldr h25, [x21, #0x0]\n" + "b 215f\n" + "211:" // Height 5: Partial accumulate: partial_4_0 + "tbz x11, #2, 213f\n" + "ldr d8, [x28], #0x8\n" + "ldr d12, [x24], #0x8\n" + "ldr d16, [x23], #0x8\n" + "ldr d20, [x22], #0x8\n" + "ldr d24, [x21], #0x8\n" + "tbz x11, #1, 212f\n" + "ld1 { v8.s }[2], [x28], #0x4\n" "mov x19, #0xc\n" - "tbz x16, #0, 220f\n" - "ld1 { v8.h }[6], [x13]\n" - "ld1 { v12.h }[6], [x9]\n" - "ld1 { v16.h }[6], [x27]\n" - "ld1 { v20.h }[6], [x25]\n" - "ld1 { v24.h }[6], [x23]\n" - "b 220f\n" - "217:" // Height 5: Partial accumulate: partial_1_4 + "ld1 { v12.s }[2], [x24], #0x4\n" + "ld1 { v16.s }[2], [x23], #0x4\n" + 
"ld1 { v20.s }[2], [x22], #0x4\n" + "ld1 { v24.s }[2], [x21], #0x4\n" + "tbz x11, #0, 215f\n" + "ld1 { v8.h }[6], [x28]\n" + "ld1 { v12.h }[6], [x24]\n" + "ld1 { v16.h }[6], [x23]\n" + "ld1 { v20.h }[6], [x22]\n" + "ld1 { v24.h }[6], [x21]\n" + "b 215f\n" + "212:" // Height 5: Partial accumulate: partial_1_4 "mov x19, #0x8\n" - "tbz x16, #0, 220f\n" - "ld1 { v8.h }[4], [x13]\n" - "ld1 { v12.h }[4], [x9]\n" - "ld1 { v16.h }[4], [x27]\n" - "ld1 { v20.h }[4], [x25]\n" - "ld1 { v24.h }[4], [x23]\n" - "b 220f\n" - "218:" // Height 5: Partial accumulate: partial_2_0 - "tbz x16, #1, 219f\n" - "ldr s8, [x13], #0x4\n" - "ldr s12, [x9], #0x4\n" - "ldr s16, [x27], #0x4\n" - "ldr s20, [x25], #0x4\n" - "ldr s24, [x23], #0x4\n" + "tbz x11, #0, 215f\n" + "ld1 { v8.h }[4], [x28]\n" + "ld1 { v12.h }[4], [x24]\n" + "ld1 { v16.h }[4], [x23]\n" + "ld1 { v20.h }[4], [x22]\n" + "ld1 { v24.h }[4], [x21]\n" + "b 215f\n" + "213:" // Height 5: Partial accumulate: partial_2_0 + "tbz x11, #1, 214f\n" + "ldr s8, [x28], #0x4\n" + "ldr s12, [x24], #0x4\n" "mov x19, #0x4\n" - "tbz x16, #0, 220f\n" - "ld1 { v8.h }[2], [x13]\n" - "ld1 { v12.h }[2], [x9]\n" - "ld1 { v16.h }[2], [x27]\n" - "ld1 { v20.h }[2], [x25]\n" - "ld1 { v24.h }[2], [x23]\n" - "b 220f\n" - "219:" // Height 5: Partial accumulate: partial_1_0 + "ldr s16, [x23], #0x4\n" + "ldr s20, [x22], #0x4\n" + "ldr s24, [x21], #0x4\n" + "tbz x11, #0, 215f\n" + "ld1 { v8.h }[2], [x28]\n" + "ld1 { v12.h }[2], [x24]\n" + "ld1 { v16.h }[2], [x23]\n" + "ld1 { v20.h }[2], [x22]\n" + "ld1 { v24.h }[2], [x21]\n" + "b 215f\n" + "214:" // Height 5: Partial accumulate: partial_1_0 + "ldr h8, [x28, #0x0]\n" "mov x19, #0x0\n" - "ldr h8, [x13, #0x0]\n" - "ldr h12, [x9, #0x0]\n" - "ldr h16, [x27, #0x0]\n" - "ldr h20, [x25, #0x0]\n" - "ldr h24, [x23, #0x0]\n" - "220:" // Height 5: Partial accumulate: Done - "sub x13, x13, x19\n" - "sub x9, x9, x19\n" - "sub x27, x27, x19\n" - "sub x25, x25, x19\n" - "sub x23, x23, x19\n" - "b 223f\n" - "221:" // Height 5: 
full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "ldr q12, [x9, #0x0]\n" - "ldr q13, [x9, #0x10]\n" - "ldr q14, [x9, #0x20]\n" - "ldr q15, [x9, #0x30]\n" - "ldr q16, [x27, #0x0]\n" - "ldr q17, [x27, #0x10]\n" - "ldr q18, [x27, #0x20]\n" - "ldr q19, [x27, #0x30]\n" - "ldr q20, [x25, #0x0]\n" - "ldr q21, [x25, #0x10]\n" - "ldr q22, [x25, #0x20]\n" - "ldr q23, [x25, #0x30]\n" - "ldr q24, [x23, #0x0]\n" - "ldr q25, [x23, #0x10]\n" - "ldr q26, [x23, #0x20]\n" - "ldr q27, [x23, #0x30]\n" - "b 223f\n" - "222:" // Height 5: no accumulate + "ldr h12, [x24, #0x0]\n" + "ldr h16, [x23, #0x0]\n" + "ldr h20, [x22, #0x0]\n" + "ldr h24, [x21, #0x0]\n" + "215:" // Height 5: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 218f\n" + "216:" // Height 5: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "ldr q12, [x24, #0x0]\n" + "ldr q13, [x24, #0x10]\n" + "ldr q14, [x24, #0x20]\n" + "ldr q15, [x24, #0x30]\n" + "ldr q16, [x23, #0x0]\n" + "ldr q17, [x23, #0x10]\n" + "ldr q18, [x23, #0x20]\n" + "ldr q19, [x23, #0x30]\n" + "ldr q20, [x22, #0x0]\n" + "ldr q21, [x22, #0x10]\n" + "ldr q22, [x22, #0x20]\n" + "ldr q23, [x22, #0x30]\n" + "ldr q24, [x21, #0x0]\n" + "ldr q25, [x21, #0x10]\n" + "ldr q26, [x21, #0x20]\n" + "ldr q27, [x21, #0x30]\n" + "b 218f\n" + "217:" // Height 5: no accumulate "movi v8.16b, #0x0\n" "movi v9.16b, #0x0\n" "movi v10.16b, #0x0\n" @@ -3267,452 +3215,452 @@ void a64_hybrid_fp16_mla_6x32 ( "movi v25.16b, #0x0\n" "movi v26.16b, #0x0\n" "movi v27.16b, #0x0\n" - "223:" // Height 5: setup done - "mov x12, #0x0\n" - "224:" // Height 5: String loop + "218:" // Height 5: setup done + "mov x27, #0x0\n" + "219:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 225f\n" - "ldr 
x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 220f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "ldr x22, [x20, #0x20]\n" - "cbnz x12, 226f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "cbnz x27, 221f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #1\n" - "add x28, x28, x19, LSL #1\n" - "add x26, x26, x19, LSL #1\n" + "add x25, x25, x19, LSL #1\n" "add x24, x24, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" "add x22, x22, x19, LSL #1\n" - "b 226f\n" - "225:" // Height 5: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #1\n" - "add x26, x28, x19, LSL #1\n" - "add x24, x26, x19, LSL #1\n" - "add x22, x24, x19, LSL #1\n" - "226:" // Height 5: input setup done - "cmp x11, #0x8\n" - "blt 229f\n" - "cmp x11, #0x10\n" - "blt 228f\n" - "227:" // Height 5: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q4, [x22, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "add x21, x21, x19, LSL #1\n" + "b 221f\n" + "220:" // Height 5: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "add x21, x22, x19, LSL #1\n" + "221:" // Height 5: input setup done + "cmp x26, #0x8\n" + "blt 224f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x10\n" + "ldr q2, [x23, #0x0]\n" + "ldr q3, [x22, #0x0]\n" + "ldr q4, [x21, #0x0]\n" + "ldr q6, [x10, #0x0]\n" + "blt 223f\n" + "222:" // Height 5: Multiply loop: Main loop head "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" + "add x25, x25, #0x10\n" "fmla v12.8h, v6.8h, 
v1.h[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" "fmla v16.8h, v6.8h, v2.h[0]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x23, x23, #0x10\n" "fmla v20.8h, v6.8h, v3.h[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x23, #0x80]\n" + "add x22, x22, #0x10\n" "fmla v24.8h, v6.8h, v4.h[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q6, [x10, #0x20]\n" + "add x21, x21, #0x10\n" "fmla v9.8h, v7.8h, v0.h[0]\n" - "add x24, x24, #0x10\n" - "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sub x26, x26, #0x8\n" "fmla v13.8h, v7.8h, v1.h[0]\n" - "add x22, x22, #0x10\n" + "prfm pldl1keep, [x21, #0x80]\n" + "cmp x26, #0x10\n" "fmla v17.8h, v7.8h, v2.h[0]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "sub x11, x11, #0x8\n" "fmla v21.8h, v7.8h, v3.h[0]\n" - "cmp x11, #0x10\n" "fmla v25.8h, v7.8h, v4.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" "fmla v10.8h, v6.8h, v0.h[0]\n" "fmla v14.8h, v6.8h, v1.h[0]\n" "fmla v18.8h, v6.8h, v2.h[0]\n" "fmla v22.8h, v6.8h, v3.h[0]\n" "fmla v26.8h, v6.8h, v4.h[0]\n" - "ldr q6, [x15, #0x40]\n" + "ldr q6, [x10, #0x40]\n" "fmla v11.8h, v7.8h, v0.h[0]\n" "fmla v15.8h, v7.8h, v1.h[0]\n" "fmla v19.8h, v7.8h, v2.h[0]\n" "fmla v23.8h, v7.8h, v3.h[0]\n" "fmla v27.8h, v7.8h, v4.h[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" "fmla v8.8h, v6.8h, v0.h[1]\n" "fmla v12.8h, v6.8h, v1.h[1]\n" "fmla v16.8h, v6.8h, v2.h[1]\n" "fmla v20.8h, v6.8h, v3.h[1]\n" "fmla v24.8h, v6.8h, v4.h[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" "fmla v9.8h, v7.8h, v0.h[1]\n" "fmla v13.8h, v7.8h, v1.h[1]\n" "fmla v17.8h, v7.8h, v2.h[1]\n" "fmla v21.8h, v7.8h, v3.h[1]\n" "fmla v25.8h, v7.8h, v4.h[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" "fmla v10.8h, v6.8h, v0.h[1]\n" "fmla v14.8h, v6.8h, v1.h[1]\n" "fmla v18.8h, v6.8h, v2.h[1]\n" 
"fmla v22.8h, v6.8h, v3.h[1]\n" "fmla v26.8h, v6.8h, v4.h[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" "fmla v11.8h, v7.8h, v0.h[1]\n" "fmla v15.8h, v7.8h, v1.h[1]\n" "fmla v19.8h, v7.8h, v2.h[1]\n" "fmla v23.8h, v7.8h, v3.h[1]\n" "fmla v27.8h, v7.8h, v4.h[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" "fmla v8.8h, v6.8h, v0.h[2]\n" "fmla v12.8h, v6.8h, v1.h[2]\n" "fmla v16.8h, v6.8h, v2.h[2]\n" "fmla v20.8h, v6.8h, v3.h[2]\n" "fmla v24.8h, v6.8h, v4.h[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" "fmla v9.8h, v7.8h, v0.h[2]\n" "fmla v13.8h, v7.8h, v1.h[2]\n" "fmla v17.8h, v7.8h, v2.h[2]\n" "fmla v21.8h, v7.8h, v3.h[2]\n" "fmla v25.8h, v7.8h, v4.h[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" "fmla v10.8h, v6.8h, v0.h[2]\n" "fmla v14.8h, v6.8h, v1.h[2]\n" "fmla v18.8h, v6.8h, v2.h[2]\n" "fmla v22.8h, v6.8h, v3.h[2]\n" "fmla v26.8h, v6.8h, v4.h[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" "fmla v11.8h, v7.8h, v0.h[2]\n" "fmla v15.8h, v7.8h, v1.h[2]\n" "fmla v19.8h, v7.8h, v2.h[2]\n" "fmla v23.8h, v7.8h, v3.h[2]\n" "fmla v27.8h, v7.8h, v4.h[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" "fmla v8.8h, v6.8h, v0.h[3]\n" "fmla v12.8h, v6.8h, v1.h[3]\n" "fmla v16.8h, v6.8h, v2.h[3]\n" "fmla v20.8h, v6.8h, v3.h[3]\n" "fmla v24.8h, v6.8h, v4.h[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" "fmla v9.8h, v7.8h, v0.h[3]\n" "fmla v13.8h, v7.8h, v1.h[3]\n" "fmla v17.8h, v7.8h, v2.h[3]\n" "fmla v21.8h, v7.8h, v3.h[3]\n" "fmla v25.8h, v7.8h, v4.h[3]\n" - "ldr q7, [x15, #0xf0]\n" + "ldr q7, [x10, #0xf0]\n" "fmla v10.8h, v6.8h, v0.h[3]\n" "fmla v14.8h, v6.8h, v1.h[3]\n" "fmla v18.8h, v6.8h, v2.h[3]\n" "fmla v22.8h, v6.8h, v3.h[3]\n" "fmla v26.8h, v6.8h, v4.h[3]\n" - "ldr q6, [x15, #0x100]\n" + "ldr q6, [x10, #0x100]\n" "fmla v11.8h, v7.8h, v0.h[3]\n" "fmla v15.8h, v7.8h, v1.h[3]\n" "fmla v19.8h, v7.8h, v2.h[3]\n" "fmla v23.8h, v7.8h, v3.h[3]\n" "fmla v27.8h, v7.8h, v4.h[3]\n" - "ldr q7, 
[x15, #0x110]\n" + "ldr q7, [x10, #0x110]\n" "fmla v8.8h, v6.8h, v0.h[4]\n" "fmla v12.8h, v6.8h, v1.h[4]\n" "fmla v16.8h, v6.8h, v2.h[4]\n" "fmla v20.8h, v6.8h, v3.h[4]\n" "fmla v24.8h, v6.8h, v4.h[4]\n" - "ldr q6, [x15, #0x120]\n" + "ldr q6, [x10, #0x120]\n" "fmla v9.8h, v7.8h, v0.h[4]\n" "fmla v13.8h, v7.8h, v1.h[4]\n" "fmla v17.8h, v7.8h, v2.h[4]\n" "fmla v21.8h, v7.8h, v3.h[4]\n" "fmla v25.8h, v7.8h, v4.h[4]\n" - "ldr q7, [x15, #0x130]\n" + "ldr q7, [x10, #0x130]\n" "fmla v10.8h, v6.8h, v0.h[4]\n" "fmla v14.8h, v6.8h, v1.h[4]\n" "fmla v18.8h, v6.8h, v2.h[4]\n" "fmla v22.8h, v6.8h, v3.h[4]\n" "fmla v26.8h, v6.8h, v4.h[4]\n" - "ldr q6, [x15, #0x140]\n" + "ldr q6, [x10, #0x140]\n" "fmla v11.8h, v7.8h, v0.h[4]\n" "fmla v15.8h, v7.8h, v1.h[4]\n" "fmla v19.8h, v7.8h, v2.h[4]\n" "fmla v23.8h, v7.8h, v3.h[4]\n" "fmla v27.8h, v7.8h, v4.h[4]\n" - "ldr q7, [x15, #0x150]\n" + "ldr q7, [x10, #0x150]\n" "fmla v8.8h, v6.8h, v0.h[5]\n" "fmla v12.8h, v6.8h, v1.h[5]\n" "fmla v16.8h, v6.8h, v2.h[5]\n" "fmla v20.8h, v6.8h, v3.h[5]\n" "fmla v24.8h, v6.8h, v4.h[5]\n" - "ldr q6, [x15, #0x160]\n" + "ldr q6, [x10, #0x160]\n" "fmla v9.8h, v7.8h, v0.h[5]\n" "fmla v13.8h, v7.8h, v1.h[5]\n" "fmla v17.8h, v7.8h, v2.h[5]\n" "fmla v21.8h, v7.8h, v3.h[5]\n" "fmla v25.8h, v7.8h, v4.h[5]\n" - "ldr q7, [x15, #0x170]\n" + "ldr q7, [x10, #0x170]\n" "fmla v10.8h, v6.8h, v0.h[5]\n" "fmla v14.8h, v6.8h, v1.h[5]\n" "fmla v18.8h, v6.8h, v2.h[5]\n" "fmla v22.8h, v6.8h, v3.h[5]\n" "fmla v26.8h, v6.8h, v4.h[5]\n" - "ldr q6, [x15, #0x180]\n" + "ldr q6, [x10, #0x180]\n" "fmla v11.8h, v7.8h, v0.h[5]\n" "fmla v15.8h, v7.8h, v1.h[5]\n" "fmla v19.8h, v7.8h, v2.h[5]\n" "fmla v23.8h, v7.8h, v3.h[5]\n" "fmla v27.8h, v7.8h, v4.h[5]\n" - "ldr q7, [x15, #0x190]\n" + "ldr q7, [x10, #0x190]\n" "fmla v8.8h, v6.8h, v0.h[6]\n" "fmla v12.8h, v6.8h, v1.h[6]\n" "fmla v16.8h, v6.8h, v2.h[6]\n" "fmla v20.8h, v6.8h, v3.h[6]\n" "fmla v24.8h, v6.8h, v4.h[6]\n" - "ldr q6, [x15, #0x1a0]\n" + "ldr q6, [x10, #0x1a0]\n" "fmla v9.8h, 
v7.8h, v0.h[6]\n" "fmla v13.8h, v7.8h, v1.h[6]\n" "fmla v17.8h, v7.8h, v2.h[6]\n" "fmla v21.8h, v7.8h, v3.h[6]\n" "fmla v25.8h, v7.8h, v4.h[6]\n" - "ldr q7, [x15, #0x1b0]\n" + "ldr q7, [x10, #0x1b0]\n" "fmla v10.8h, v6.8h, v0.h[6]\n" "fmla v14.8h, v6.8h, v1.h[6]\n" "fmla v18.8h, v6.8h, v2.h[6]\n" "fmla v22.8h, v6.8h, v3.h[6]\n" "fmla v26.8h, v6.8h, v4.h[6]\n" - "ldr q6, [x15, #0x1c0]\n" + "ldr q6, [x10, #0x1c0]\n" "fmla v11.8h, v7.8h, v0.h[6]\n" "fmla v15.8h, v7.8h, v1.h[6]\n" "fmla v19.8h, v7.8h, v2.h[6]\n" "fmla v23.8h, v7.8h, v3.h[6]\n" "fmla v27.8h, v7.8h, v4.h[6]\n" - "ldr q7, [x15, #0x1d0]\n" + "ldr q7, [x10, #0x1d0]\n" "fmla v8.8h, v6.8h, v0.h[7]\n" "fmla v12.8h, v6.8h, v1.h[7]\n" "fmla v16.8h, v6.8h, v2.h[7]\n" "fmla v20.8h, v6.8h, v3.h[7]\n" "fmla v24.8h, v6.8h, v4.h[7]\n" - "ldr q6, [x15, #0x1e0]\n" + "ldr q6, [x10, #0x1e0]\n" "fmla v9.8h, v7.8h, v0.h[7]\n" "fmla v13.8h, v7.8h, v1.h[7]\n" "fmla v17.8h, v7.8h, v2.h[7]\n" "fmla v21.8h, v7.8h, v3.h[7]\n" "fmla v25.8h, v7.8h, v4.h[7]\n" - "ldr q7, [x15, #0x1f0]\n" + "ldr q7, [x10, #0x1f0]\n" + "add x10, x10, #0x200\n" "fmla v10.8h, v6.8h, v0.h[7]\n" - "add x15, x15, #0x200\n" "fmla v14.8h, v6.8h, v1.h[7]\n" "fmla v18.8h, v6.8h, v2.h[7]\n" "fmla v22.8h, v6.8h, v3.h[7]\n" "fmla v26.8h, v6.8h, v4.h[7]\n" + "ldr q6, [x10, #0x0]\n" "fmla v11.8h, v7.8h, v0.h[7]\n" + "ldr q0, [x25, #0x0]\n" "fmla v15.8h, v7.8h, v1.h[7]\n" + "ldr q1, [x24, #0x0]\n" "fmla v19.8h, v7.8h, v2.h[7]\n" + "ldr q2, [x23, #0x0]\n" "fmla v23.8h, v7.8h, v3.h[7]\n" + "ldr q3, [x22, #0x0]\n" "fmla v27.8h, v7.8h, v4.h[7]\n" - "bge 227b\n" - "228:" // Height 5: Multiply loop: Single iteration only - "sub x11, x11, #0x8\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q4, [x22, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "ldr q4, [x21, #0x0]\n" + "bge 222b\n" + "223:" // Height 5: Multiply loop: Single iteration only "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, 
#0x10]\n" + "sub x26, x26, #0x8\n" "fmla v12.8h, v6.8h, v1.h[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "add x25, x25, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" "fmla v16.8h, v6.8h, v2.h[0]\n" - "add x28, x28, #0x10\n" + "add x24, x24, #0x10\n" "fmla v20.8h, v6.8h, v3.h[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x23, x23, #0x10\n" "fmla v24.8h, v6.8h, v4.h[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "ldr q6, [x15, #0x20]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "add x22, x22, #0x10\n" "fmla v9.8h, v7.8h, v0.h[0]\n" - "add x24, x24, #0x10\n" - "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x10, #0x20]\n" + "add x21, x21, #0x10\n" "fmla v13.8h, v7.8h, v1.h[0]\n" - "add x22, x22, #0x10\n" - "fmla v17.8h, v7.8h, v2.h[0]\n" "prfm pldl1keep, [x22, #0x80]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x21, #0x80]\n" "fmla v21.8h, v7.8h, v3.h[0]\n" "fmla v25.8h, v7.8h, v4.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" "fmla v10.8h, v6.8h, v0.h[0]\n" "fmla v14.8h, v6.8h, v1.h[0]\n" "fmla v18.8h, v6.8h, v2.h[0]\n" "fmla v22.8h, v6.8h, v3.h[0]\n" "fmla v26.8h, v6.8h, v4.h[0]\n" - "ldr q6, [x15, #0x40]\n" + "ldr q6, [x10, #0x40]\n" "fmla v11.8h, v7.8h, v0.h[0]\n" "fmla v15.8h, v7.8h, v1.h[0]\n" "fmla v19.8h, v7.8h, v2.h[0]\n" "fmla v23.8h, v7.8h, v3.h[0]\n" "fmla v27.8h, v7.8h, v4.h[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" "fmla v8.8h, v6.8h, v0.h[1]\n" "fmla v12.8h, v6.8h, v1.h[1]\n" "fmla v16.8h, v6.8h, v2.h[1]\n" "fmla v20.8h, v6.8h, v3.h[1]\n" "fmla v24.8h, v6.8h, v4.h[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" "fmla v9.8h, v7.8h, v0.h[1]\n" "fmla v13.8h, v7.8h, v1.h[1]\n" "fmla v17.8h, v7.8h, v2.h[1]\n" "fmla v21.8h, v7.8h, v3.h[1]\n" "fmla v25.8h, v7.8h, v4.h[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" "fmla v10.8h, v6.8h, v0.h[1]\n" "fmla v14.8h, v6.8h, v1.h[1]\n" "fmla v18.8h, v6.8h, v2.h[1]\n" "fmla 
v22.8h, v6.8h, v3.h[1]\n" "fmla v26.8h, v6.8h, v4.h[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" "fmla v11.8h, v7.8h, v0.h[1]\n" "fmla v15.8h, v7.8h, v1.h[1]\n" "fmla v19.8h, v7.8h, v2.h[1]\n" "fmla v23.8h, v7.8h, v3.h[1]\n" "fmla v27.8h, v7.8h, v4.h[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" "fmla v8.8h, v6.8h, v0.h[2]\n" "fmla v12.8h, v6.8h, v1.h[2]\n" "fmla v16.8h, v6.8h, v2.h[2]\n" "fmla v20.8h, v6.8h, v3.h[2]\n" "fmla v24.8h, v6.8h, v4.h[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" "fmla v9.8h, v7.8h, v0.h[2]\n" "fmla v13.8h, v7.8h, v1.h[2]\n" "fmla v17.8h, v7.8h, v2.h[2]\n" "fmla v21.8h, v7.8h, v3.h[2]\n" "fmla v25.8h, v7.8h, v4.h[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" "fmla v10.8h, v6.8h, v0.h[2]\n" "fmla v14.8h, v6.8h, v1.h[2]\n" "fmla v18.8h, v6.8h, v2.h[2]\n" "fmla v22.8h, v6.8h, v3.h[2]\n" "fmla v26.8h, v6.8h, v4.h[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" "fmla v11.8h, v7.8h, v0.h[2]\n" "fmla v15.8h, v7.8h, v1.h[2]\n" "fmla v19.8h, v7.8h, v2.h[2]\n" "fmla v23.8h, v7.8h, v3.h[2]\n" "fmla v27.8h, v7.8h, v4.h[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" "fmla v8.8h, v6.8h, v0.h[3]\n" "fmla v12.8h, v6.8h, v1.h[3]\n" "fmla v16.8h, v6.8h, v2.h[3]\n" "fmla v20.8h, v6.8h, v3.h[3]\n" "fmla v24.8h, v6.8h, v4.h[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" "fmla v9.8h, v7.8h, v0.h[3]\n" "fmla v13.8h, v7.8h, v1.h[3]\n" "fmla v17.8h, v7.8h, v2.h[3]\n" "fmla v21.8h, v7.8h, v3.h[3]\n" "fmla v25.8h, v7.8h, v4.h[3]\n" - "ldr q7, [x15, #0xf0]\n" + "ldr q7, [x10, #0xf0]\n" "fmla v10.8h, v6.8h, v0.h[3]\n" "fmla v14.8h, v6.8h, v1.h[3]\n" "fmla v18.8h, v6.8h, v2.h[3]\n" "fmla v22.8h, v6.8h, v3.h[3]\n" "fmla v26.8h, v6.8h, v4.h[3]\n" - "ldr q6, [x15, #0x100]\n" + "ldr q6, [x10, #0x100]\n" "fmla v11.8h, v7.8h, v0.h[3]\n" "fmla v15.8h, v7.8h, v1.h[3]\n" "fmla v19.8h, v7.8h, v2.h[3]\n" "fmla v23.8h, v7.8h, v3.h[3]\n" "fmla v27.8h, v7.8h, v4.h[3]\n" - "ldr q7, [x15, 
#0x110]\n" + "ldr q7, [x10, #0x110]\n" "fmla v8.8h, v6.8h, v0.h[4]\n" "fmla v12.8h, v6.8h, v1.h[4]\n" "fmla v16.8h, v6.8h, v2.h[4]\n" "fmla v20.8h, v6.8h, v3.h[4]\n" "fmla v24.8h, v6.8h, v4.h[4]\n" - "ldr q6, [x15, #0x120]\n" + "ldr q6, [x10, #0x120]\n" "fmla v9.8h, v7.8h, v0.h[4]\n" "fmla v13.8h, v7.8h, v1.h[4]\n" "fmla v17.8h, v7.8h, v2.h[4]\n" "fmla v21.8h, v7.8h, v3.h[4]\n" "fmla v25.8h, v7.8h, v4.h[4]\n" - "ldr q7, [x15, #0x130]\n" + "ldr q7, [x10, #0x130]\n" "fmla v10.8h, v6.8h, v0.h[4]\n" "fmla v14.8h, v6.8h, v1.h[4]\n" "fmla v18.8h, v6.8h, v2.h[4]\n" "fmla v22.8h, v6.8h, v3.h[4]\n" "fmla v26.8h, v6.8h, v4.h[4]\n" - "ldr q6, [x15, #0x140]\n" + "ldr q6, [x10, #0x140]\n" "fmla v11.8h, v7.8h, v0.h[4]\n" "fmla v15.8h, v7.8h, v1.h[4]\n" "fmla v19.8h, v7.8h, v2.h[4]\n" "fmla v23.8h, v7.8h, v3.h[4]\n" "fmla v27.8h, v7.8h, v4.h[4]\n" - "ldr q7, [x15, #0x150]\n" + "ldr q7, [x10, #0x150]\n" "fmla v8.8h, v6.8h, v0.h[5]\n" "fmla v12.8h, v6.8h, v1.h[5]\n" "fmla v16.8h, v6.8h, v2.h[5]\n" "fmla v20.8h, v6.8h, v3.h[5]\n" "fmla v24.8h, v6.8h, v4.h[5]\n" - "ldr q6, [x15, #0x160]\n" + "ldr q6, [x10, #0x160]\n" "fmla v9.8h, v7.8h, v0.h[5]\n" "fmla v13.8h, v7.8h, v1.h[5]\n" "fmla v17.8h, v7.8h, v2.h[5]\n" "fmla v21.8h, v7.8h, v3.h[5]\n" "fmla v25.8h, v7.8h, v4.h[5]\n" - "ldr q7, [x15, #0x170]\n" + "ldr q7, [x10, #0x170]\n" "fmla v10.8h, v6.8h, v0.h[5]\n" "fmla v14.8h, v6.8h, v1.h[5]\n" "fmla v18.8h, v6.8h, v2.h[5]\n" "fmla v22.8h, v6.8h, v3.h[5]\n" "fmla v26.8h, v6.8h, v4.h[5]\n" - "ldr q6, [x15, #0x180]\n" + "ldr q6, [x10, #0x180]\n" "fmla v11.8h, v7.8h, v0.h[5]\n" "fmla v15.8h, v7.8h, v1.h[5]\n" "fmla v19.8h, v7.8h, v2.h[5]\n" "fmla v23.8h, v7.8h, v3.h[5]\n" "fmla v27.8h, v7.8h, v4.h[5]\n" - "ldr q7, [x15, #0x190]\n" + "ldr q7, [x10, #0x190]\n" "fmla v8.8h, v6.8h, v0.h[6]\n" "fmla v12.8h, v6.8h, v1.h[6]\n" "fmla v16.8h, v6.8h, v2.h[6]\n" "fmla v20.8h, v6.8h, v3.h[6]\n" "fmla v24.8h, v6.8h, v4.h[6]\n" - "ldr q6, [x15, #0x1a0]\n" + "ldr q6, [x10, #0x1a0]\n" "fmla v9.8h, v7.8h, 
v0.h[6]\n" "fmla v13.8h, v7.8h, v1.h[6]\n" "fmla v17.8h, v7.8h, v2.h[6]\n" "fmla v21.8h, v7.8h, v3.h[6]\n" "fmla v25.8h, v7.8h, v4.h[6]\n" - "ldr q7, [x15, #0x1b0]\n" + "ldr q7, [x10, #0x1b0]\n" "fmla v10.8h, v6.8h, v0.h[6]\n" "fmla v14.8h, v6.8h, v1.h[6]\n" "fmla v18.8h, v6.8h, v2.h[6]\n" "fmla v22.8h, v6.8h, v3.h[6]\n" "fmla v26.8h, v6.8h, v4.h[6]\n" - "ldr q6, [x15, #0x1c0]\n" + "ldr q6, [x10, #0x1c0]\n" "fmla v11.8h, v7.8h, v0.h[6]\n" "fmla v15.8h, v7.8h, v1.h[6]\n" "fmla v19.8h, v7.8h, v2.h[6]\n" "fmla v23.8h, v7.8h, v3.h[6]\n" "fmla v27.8h, v7.8h, v4.h[6]\n" - "ldr q7, [x15, #0x1d0]\n" + "ldr q7, [x10, #0x1d0]\n" "fmla v8.8h, v6.8h, v0.h[7]\n" "fmla v12.8h, v6.8h, v1.h[7]\n" "fmla v16.8h, v6.8h, v2.h[7]\n" "fmla v20.8h, v6.8h, v3.h[7]\n" "fmla v24.8h, v6.8h, v4.h[7]\n" - "ldr q6, [x15, #0x1e0]\n" + "ldr q6, [x10, #0x1e0]\n" "fmla v9.8h, v7.8h, v0.h[7]\n" "fmla v13.8h, v7.8h, v1.h[7]\n" "fmla v17.8h, v7.8h, v2.h[7]\n" "fmla v21.8h, v7.8h, v3.h[7]\n" "fmla v25.8h, v7.8h, v4.h[7]\n" - "ldr q7, [x15, #0x1f0]\n" + "ldr q7, [x10, #0x1f0]\n" + "add x10, x10, #0x200\n" "fmla v10.8h, v6.8h, v0.h[7]\n" - "add x15, x15, #0x200\n" "fmla v14.8h, v6.8h, v1.h[7]\n" "fmla v18.8h, v6.8h, v2.h[7]\n" "fmla v22.8h, v6.8h, v3.h[7]\n" @@ -3722,31 +3670,31 @@ void a64_hybrid_fp16_mla_6x32 ( "fmla v19.8h, v7.8h, v2.h[7]\n" "fmla v23.8h, v7.8h, v3.h[7]\n" "fmla v27.8h, v7.8h, v4.h[7]\n" - "229:" // Height 5: Multiply loop: Main loop skip - "cbz x11, 231f\n" - "230:" // Height 5: Multiply loop: Odd block loop - "ldr h0, [x10], #0x2\n" - "ldr h1, [x28], #0x2\n" - "ldr h2, [x26], #0x2\n" - "ldr h3, [x24], #0x2\n" - "ldr h4, [x22], #0x2\n" - "ldr q6, [x15, #0x0]\n" + "224:" // Height 5: Multiply loop: Main loop skip + "cbz x26, 226f\n" + "225:" // Height 5: Multiply loop: Odd block loop + "ldr h0, [x25], #0x2\n" + "sub x26, x26, #0x1\n" + "ldr h1, [x24], #0x2\n" + "ldr h2, [x23], #0x2\n" + "ldr h3, [x22], #0x2\n" + "ldr h4, [x21], #0x2\n" + "ldr q6, [x10, #0x0]\n" "fmla v8.8h, v6.8h, 
v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" "fmla v12.8h, v6.8h, v1.h[0]\n" - "sub x11, x11, #0x1\n" "fmla v16.8h, v6.8h, v2.h[0]\n" "fmla v20.8h, v6.8h, v3.h[0]\n" "fmla v24.8h, v6.8h, v4.h[0]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q6, [x10, #0x20]\n" "fmla v9.8h, v7.8h, v0.h[0]\n" "fmla v13.8h, v7.8h, v1.h[0]\n" "fmla v17.8h, v7.8h, v2.h[0]\n" "fmla v21.8h, v7.8h, v3.h[0]\n" "fmla v25.8h, v7.8h, v4.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" "fmla v10.8h, v6.8h, v0.h[0]\n" - "add x15, x15, #0x40\n" "fmla v14.8h, v6.8h, v1.h[0]\n" "fmla v18.8h, v6.8h, v2.h[0]\n" "fmla v22.8h, v6.8h, v3.h[0]\n" @@ -3756,18 +3704,23 @@ void a64_hybrid_fp16_mla_6x32 ( "fmla v19.8h, v7.8h, v2.h[0]\n" "fmla v23.8h, v7.8h, v3.h[0]\n" "fmla v27.8h, v7.8h, v4.h[0]\n" - "cbnz x11, 230b\n" - "231:" // Height 5: Multiply loop: No odd multiplies + "cbnz x26, 225b\n" + "226:" // Height 5: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 224b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "prfm pstl1keep, [x27, #0x0]\n" - "prfm pstl1keep, [x25, #0x0]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 219b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x24, x28, x19, LSL #1\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #1\n" "prfm pstl1keep, [x23, #0x0]\n" - "tbz %x[flags], #1, 232f\n" + "add x22, x23, x19, LSL #1\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #1\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbz %x[flags], #1, 227f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1r { v1.8h }, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -3812,305 +3765,281 @@ void a64_hybrid_fp16_mla_6x32 ( "fmax v26.8h, v26.8h, v1.8h\n" "fmin v27.8h, v27.8h, v0.8h\n" "fmax v27.8h, v27.8h, v1.8h\n" - "232:" // Height 5: No activation - "cmp x16, 
#0x20\n" - "bge 249f\n" - "tbz x16, #4, 240f\n" - "st1 { v8.8h }, [x13], #0x10\n" - "st1 { v9.8h }, [x13], #0x10\n" - "st1 { v12.8h }, [x9], #0x10\n" - "st1 { v13.8h }, [x9], #0x10\n" - "st1 { v16.8h }, [x27], #0x10\n" - "st1 { v17.8h }, [x27], #0x10\n" - "st1 { v20.8h }, [x25], #0x10\n" - "st1 { v21.8h }, [x25], #0x10\n" - "st1 { v24.8h }, [x23], #0x10\n" - "st1 { v25.8h }, [x23], #0x10\n" - "tbz x16, #3, 236f\n" - "st1 { v10.8h }, [x13], #0x10\n" - "st1 { v14.8h }, [x9], #0x10\n" - "st1 { v18.8h }, [x27], #0x10\n" - "st1 { v22.8h }, [x25], #0x10\n" - "st1 { v26.8h }, [x23], #0x10\n" - "tbz x16, #2, 234f\n" - "str d11, [x13], #0x8\n" - "str d15, [x9], #0x8\n" - "str d19, [x27], #0x8\n" - "str d23, [x25], #0x8\n" - "str d27, [x23], #0x8\n" - "tbz x16, #1, 233f\n" - "st1 { v11.s }[2], [x13], #0x4\n" - "st1 { v15.s }[2], [x9], #0x4\n" - "st1 { v19.s }[2], [x27], #0x4\n" - "st1 { v23.s }[2], [x25], #0x4\n" - "st1 { v27.s }[2], [x23], #0x4\n" - "tbz x16, #0, 248f\n" - "st1 { v11.h }[6], [x13]\n" - "st1 { v15.h }[6], [x9]\n" - "st1 { v19.h }[6], [x27]\n" - "st1 { v23.h }[6], [x25]\n" - "st1 { v27.h }[6], [x23]\n" - "b 248f\n" - "233:" // Height 5: Partial direct writeback: partial_1_28 - "tbz x16, #0, 248f\n" - "st1 { v11.h }[4], [x13]\n" - "st1 { v15.h }[4], [x9]\n" - "st1 { v19.h }[4], [x27]\n" - "st1 { v23.h }[4], [x25]\n" - "st1 { v27.h }[4], [x23]\n" - "b 248f\n" - "234:" // Height 5: Partial direct writeback: partial_2_24 - "tbz x16, #1, 235f\n" - "str s11, [x13], #0x4\n" - "str s15, [x9], #0x4\n" - "str s19, [x27], #0x4\n" - "str s23, [x25], #0x4\n" - "str s27, [x23], #0x4\n" - "tbz x16, #0, 248f\n" - "st1 { v11.h }[2], [x13]\n" - "st1 { v15.h }[2], [x9]\n" - "st1 { v19.h }[2], [x27]\n" - "st1 { v23.h }[2], [x25]\n" - "st1 { v27.h }[2], [x23]\n" - "b 248f\n" - "235:" // Height 5: Partial direct writeback: partial_1_24 - "tbz x16, #0, 248f\n" - "str h11, [x13, #0x0]\n" - "str h15, [x9, #0x0]\n" - "str h19, [x27, #0x0]\n" - "str h23, [x25, #0x0]\n" - "str h27, 
[x23, #0x0]\n" - "b 248f\n" - "236:" // Height 5: Partial direct writeback: partial_4_16 - "tbz x16, #2, 238f\n" - "str d10, [x13], #0x8\n" - "str d14, [x9], #0x8\n" - "str d18, [x27], #0x8\n" - "str d22, [x25], #0x8\n" - "str d26, [x23], #0x8\n" - "tbz x16, #1, 237f\n" - "st1 { v10.s }[2], [x13], #0x4\n" - "st1 { v14.s }[2], [x9], #0x4\n" - "st1 { v18.s }[2], [x27], #0x4\n" - "st1 { v22.s }[2], [x25], #0x4\n" - "st1 { v26.s }[2], [x23], #0x4\n" - "tbz x16, #0, 248f\n" - "st1 { v10.h }[6], [x13]\n" - "st1 { v14.h }[6], [x9]\n" - "st1 { v18.h }[6], [x27]\n" - "st1 { v22.h }[6], [x25]\n" - "st1 { v26.h }[6], [x23]\n" - "b 248f\n" - "237:" // Height 5: Partial direct writeback: partial_1_20 - "tbz x16, #0, 248f\n" - "st1 { v10.h }[4], [x13]\n" - "st1 { v14.h }[4], [x9]\n" - "st1 { v18.h }[4], [x27]\n" - "st1 { v22.h }[4], [x25]\n" - "st1 { v26.h }[4], [x23]\n" - "b 248f\n" - "238:" // Height 5: Partial direct writeback: partial_2_16 - "tbz x16, #1, 239f\n" - "str s10, [x13], #0x4\n" - "str s14, [x9], #0x4\n" - "str s18, [x27], #0x4\n" - "str s22, [x25], #0x4\n" - "str s26, [x23], #0x4\n" - "tbz x16, #0, 248f\n" - "st1 { v10.h }[2], [x13]\n" - "st1 { v14.h }[2], [x9]\n" - "st1 { v18.h }[2], [x27]\n" - "st1 { v22.h }[2], [x25]\n" - "st1 { v26.h }[2], [x23]\n" - "b 248f\n" - "239:" // Height 5: Partial direct writeback: partial_1_16 - "tbz x16, #0, 248f\n" - "str h10, [x13, #0x0]\n" - "str h14, [x9, #0x0]\n" - "str h18, [x27, #0x0]\n" - "str h22, [x25, #0x0]\n" - "str h26, [x23, #0x0]\n" - "b 248f\n" - "240:" // Height 5: Partial direct writeback: partial_8_0 - "tbz x16, #3, 244f\n" - "st1 { v8.8h }, [x13], #0x10\n" - "st1 { v12.8h }, [x9], #0x10\n" - "st1 { v16.8h }, [x27], #0x10\n" - "st1 { v20.8h }, [x25], #0x10\n" - "st1 { v24.8h }, [x23], #0x10\n" - "tbz x16, #2, 242f\n" - "str d9, [x13], #0x8\n" - "str d13, [x9], #0x8\n" - "str d17, [x27], #0x8\n" - "str d21, [x25], #0x8\n" - "str d25, [x23], #0x8\n" - "tbz x16, #1, 241f\n" - "st1 { v9.s }[2], [x13], #0x4\n" - "st1 
{ v13.s }[2], [x9], #0x4\n" - "st1 { v17.s }[2], [x27], #0x4\n" - "st1 { v21.s }[2], [x25], #0x4\n" - "st1 { v25.s }[2], [x23], #0x4\n" - "tbz x16, #0, 248f\n" - "st1 { v9.h }[6], [x13]\n" - "st1 { v13.h }[6], [x9]\n" - "st1 { v17.h }[6], [x27]\n" - "st1 { v21.h }[6], [x25]\n" - "st1 { v25.h }[6], [x23]\n" - "b 248f\n" - "241:" // Height 5: Partial direct writeback: partial_1_12 - "tbz x16, #0, 248f\n" - "st1 { v9.h }[4], [x13]\n" - "st1 { v13.h }[4], [x9]\n" - "st1 { v17.h }[4], [x27]\n" - "st1 { v21.h }[4], [x25]\n" - "st1 { v25.h }[4], [x23]\n" - "b 248f\n" - "242:" // Height 5: Partial direct writeback: partial_2_8 - "tbz x16, #1, 243f\n" - "str s9, [x13], #0x4\n" - "str s13, [x9], #0x4\n" - "str s17, [x27], #0x4\n" - "str s21, [x25], #0x4\n" - "str s25, [x23], #0x4\n" - "tbz x16, #0, 248f\n" - "st1 { v9.h }[2], [x13]\n" - "st1 { v13.h }[2], [x9]\n" - "st1 { v17.h }[2], [x27]\n" - "st1 { v21.h }[2], [x25]\n" - "st1 { v25.h }[2], [x23]\n" - "b 248f\n" - "243:" // Height 5: Partial direct writeback: partial_1_8 - "tbz x16, #0, 248f\n" - "str h9, [x13, #0x0]\n" - "str h13, [x9, #0x0]\n" - "str h17, [x27, #0x0]\n" - "str h21, [x25, #0x0]\n" - "str h25, [x23, #0x0]\n" - "b 248f\n" - "244:" // Height 5: Partial direct writeback: partial_4_0 - "tbz x16, #2, 246f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "str d16, [x27], #0x8\n" - "str d20, [x25], #0x8\n" - "str d24, [x23], #0x8\n" - "tbz x16, #1, 245f\n" - "st1 { v8.s }[2], [x13], #0x4\n" - "st1 { v12.s }[2], [x9], #0x4\n" - "st1 { v16.s }[2], [x27], #0x4\n" - "st1 { v20.s }[2], [x25], #0x4\n" - "st1 { v24.s }[2], [x23], #0x4\n" - "tbz x16, #0, 248f\n" - "st1 { v8.h }[6], [x13]\n" - "st1 { v12.h }[6], [x9]\n" - "st1 { v16.h }[6], [x27]\n" - "st1 { v20.h }[6], [x25]\n" - "st1 { v24.h }[6], [x23]\n" - "b 248f\n" - "245:" // Height 5: Partial direct writeback: partial_1_4 - "tbz x16, #0, 248f\n" - "st1 { v8.h }[4], [x13]\n" - "st1 { v12.h }[4], [x9]\n" - "st1 { v16.h }[4], [x27]\n" - "st1 { v20.h }[4], 
[x25]\n" - "st1 { v24.h }[4], [x23]\n" - "b 248f\n" - "246:" // Height 5: Partial direct writeback: partial_2_0 - "tbz x16, #1, 247f\n" - "str s8, [x13], #0x4\n" - "str s12, [x9], #0x4\n" - "str s16, [x27], #0x4\n" - "str s20, [x25], #0x4\n" - "str s24, [x23], #0x4\n" - "tbz x16, #0, 248f\n" - "st1 { v8.h }[2], [x13]\n" - "st1 { v12.h }[2], [x9]\n" - "st1 { v16.h }[2], [x27]\n" - "st1 { v20.h }[2], [x25]\n" - "st1 { v24.h }[2], [x23]\n" - "b 248f\n" - "247:" // Height 5: Partial direct writeback: partial_1_0 - "str h8, [x13, #0x0]\n" - "str h12, [x9, #0x0]\n" - "str h16, [x27, #0x0]\n" - "str h20, [x25, #0x0]\n" - "str h24, [x23, #0x0]\n" - "248:" // Height 5: Partial direct writeback: Done - "b 250f\n" - "249:" // Height 5: Full writeback - "str q8, [x13, #0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "str q12, [x9, #0x0]\n" - "str q13, [x9, #0x10]\n" - "str q14, [x9, #0x20]\n" - "str q15, [x9, #0x30]\n" - "str q16, [x27, #0x0]\n" - "str q17, [x27, #0x10]\n" - "str q18, [x27, #0x20]\n" - "str q19, [x27, #0x30]\n" - "str q20, [x25, #0x0]\n" - "str q21, [x25, #0x10]\n" - "str q22, [x25, #0x20]\n" - "str q23, [x25, #0x30]\n" - "str q24, [x23, #0x0]\n" - "str q25, [x23, #0x10]\n" - "str q26, [x23, #0x20]\n" - "str q27, [x23, #0x30]\n" - "add x13, x13, #0x40\n" - "add x9, x9, #0x40\n" - "add x27, x27, #0x40\n" - "add x25, x25, #0x40\n" - "add x23, x23, #0x40\n" - "250:" // Height 5: Writeback done - "subs x16, x16, #0x20\n" - "bgt 203b\n" - "b 302f\n" - "251:" // Height 6 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 252f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #1\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #1\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "ldr x23, [%x[output_ptr], #0x20]\n" 
- "add x27, x27, x19, LSL #1\n" - "ldr x21, [%x[output_ptr], #0x28]\n" - "add %x[output_ptr], %x[output_ptr], #0x30\n" - "add x25, x25, x19, LSL #1\n" - "add x23, x23, x19, LSL #1\n" - "add x21, x21, x19, LSL #1\n" - "b 253f\n" - "252:" // Height 6: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #1\n" - "add x27, x9, x19, LSL #1\n" - "add x25, x27, x19, LSL #1\n" - "add x23, x25, x19, LSL #1\n" - "add x21, x23, x19, LSL #1\n" - "add %x[output_ptr], x21, x19, LSL #1\n" - "253:" // Height 6: Column loop - "cbz x14, 254f\n" - "ldr q8, [x14, #0x0]\n" + "227:" // Height 5: No activation + "cmp x11, #0x20\n" + "bge 244f\n" + "tbz x11, #4, 235f\n" + "st1 { v8.8h }, [x28], #0x10\n" + "st1 { v9.8h }, [x28], #0x10\n" + "st1 { v12.8h }, [x24], #0x10\n" + "st1 { v13.8h }, [x24], #0x10\n" + "st1 { v16.8h }, [x23], #0x10\n" + "st1 { v17.8h }, [x23], #0x10\n" + "st1 { v20.8h }, [x22], #0x10\n" + "st1 { v21.8h }, [x22], #0x10\n" + "st1 { v24.8h }, [x21], #0x10\n" + "st1 { v25.8h }, [x21], #0x10\n" + "tbz x11, #3, 231f\n" + "st1 { v10.8h }, [x28], #0x10\n" + "st1 { v14.8h }, [x24], #0x10\n" + "st1 { v18.8h }, [x23], #0x10\n" + "st1 { v22.8h }, [x22], #0x10\n" + "st1 { v26.8h }, [x21], #0x10\n" + "tbz x11, #2, 229f\n" + "str d11, [x28], #0x8\n" + "str d15, [x24], #0x8\n" + "str d19, [x23], #0x8\n" + "str d23, [x22], #0x8\n" + "str d27, [x21], #0x8\n" + "tbz x11, #1, 228f\n" + "st1 { v11.s }[2], [x28], #0x4\n" + "st1 { v15.s }[2], [x24], #0x4\n" + "st1 { v19.s }[2], [x23], #0x4\n" + "st1 { v23.s }[2], [x22], #0x4\n" + "st1 { v27.s }[2], [x21], #0x4\n" + "tbz x11, #0, 243f\n" + "st1 { v11.h }[6], [x28]\n" + "st1 { v15.h }[6], [x24]\n" + "st1 { v19.h }[6], [x23]\n" + "st1 { v23.h }[6], [x22]\n" + "st1 { v27.h }[6], [x21]\n" + "b 243f\n" + "228:" // Height 5: Partial direct writeback: partial_1_28 + "tbz x11, #0, 243f\n" + "st1 { v11.h }[4], [x28]\n" + "st1 { v15.h }[4], [x24]\n" + "st1 { v19.h }[4], [x23]\n" + "st1 { v23.h }[4], [x22]\n" + "st1 { v27.h }[4], 
[x21]\n" + "b 243f\n" + "229:" // Height 5: Partial direct writeback: partial_2_24 + "tbz x11, #1, 230f\n" + "str s11, [x28], #0x4\n" + "str s15, [x24], #0x4\n" + "str s19, [x23], #0x4\n" + "str s23, [x22], #0x4\n" + "str s27, [x21], #0x4\n" + "tbz x11, #0, 243f\n" + "st1 { v11.h }[2], [x28]\n" + "st1 { v15.h }[2], [x24]\n" + "st1 { v19.h }[2], [x23]\n" + "st1 { v23.h }[2], [x22]\n" + "st1 { v27.h }[2], [x21]\n" + "b 243f\n" + "230:" // Height 5: Partial direct writeback: partial_1_24 + "tbz x11, #0, 243f\n" + "str h11, [x28, #0x0]\n" + "str h15, [x24, #0x0]\n" + "str h19, [x23, #0x0]\n" + "str h23, [x22, #0x0]\n" + "str h27, [x21, #0x0]\n" + "b 243f\n" + "231:" // Height 5: Partial direct writeback: partial_4_16 + "tbz x11, #2, 233f\n" + "str d10, [x28], #0x8\n" + "str d14, [x24], #0x8\n" + "str d18, [x23], #0x8\n" + "str d22, [x22], #0x8\n" + "str d26, [x21], #0x8\n" + "tbz x11, #1, 232f\n" + "st1 { v10.s }[2], [x28], #0x4\n" + "st1 { v14.s }[2], [x24], #0x4\n" + "st1 { v18.s }[2], [x23], #0x4\n" + "st1 { v22.s }[2], [x22], #0x4\n" + "st1 { v26.s }[2], [x21], #0x4\n" + "tbz x11, #0, 243f\n" + "st1 { v10.h }[6], [x28]\n" + "st1 { v14.h }[6], [x24]\n" + "st1 { v18.h }[6], [x23]\n" + "st1 { v22.h }[6], [x22]\n" + "st1 { v26.h }[6], [x21]\n" + "b 243f\n" + "232:" // Height 5: Partial direct writeback: partial_1_20 + "tbz x11, #0, 243f\n" + "st1 { v10.h }[4], [x28]\n" + "st1 { v14.h }[4], [x24]\n" + "st1 { v18.h }[4], [x23]\n" + "st1 { v22.h }[4], [x22]\n" + "st1 { v26.h }[4], [x21]\n" + "b 243f\n" + "233:" // Height 5: Partial direct writeback: partial_2_16 + "tbz x11, #1, 234f\n" + "str s10, [x28], #0x4\n" + "str s14, [x24], #0x4\n" + "str s18, [x23], #0x4\n" + "str s22, [x22], #0x4\n" + "str s26, [x21], #0x4\n" + "tbz x11, #0, 243f\n" + "st1 { v10.h }[2], [x28]\n" + "st1 { v14.h }[2], [x24]\n" + "st1 { v18.h }[2], [x23]\n" + "st1 { v22.h }[2], [x22]\n" + "st1 { v26.h }[2], [x21]\n" + "b 243f\n" + "234:" // Height 5: Partial direct writeback: partial_1_16 + "tbz 
x11, #0, 243f\n" + "str h10, [x28, #0x0]\n" + "str h14, [x24, #0x0]\n" + "str h18, [x23, #0x0]\n" + "str h22, [x22, #0x0]\n" + "str h26, [x21, #0x0]\n" + "b 243f\n" + "235:" // Height 5: Partial direct writeback: partial_8_0 + "tbz x11, #3, 239f\n" + "st1 { v8.8h }, [x28], #0x10\n" + "st1 { v12.8h }, [x24], #0x10\n" + "st1 { v16.8h }, [x23], #0x10\n" + "st1 { v20.8h }, [x22], #0x10\n" + "st1 { v24.8h }, [x21], #0x10\n" + "tbz x11, #2, 237f\n" + "str d9, [x28], #0x8\n" + "str d13, [x24], #0x8\n" + "str d17, [x23], #0x8\n" + "str d21, [x22], #0x8\n" + "str d25, [x21], #0x8\n" + "tbz x11, #1, 236f\n" + "st1 { v9.s }[2], [x28], #0x4\n" + "st1 { v13.s }[2], [x24], #0x4\n" + "st1 { v17.s }[2], [x23], #0x4\n" + "st1 { v21.s }[2], [x22], #0x4\n" + "st1 { v25.s }[2], [x21], #0x4\n" + "tbz x11, #0, 243f\n" + "st1 { v9.h }[6], [x28]\n" + "st1 { v13.h }[6], [x24]\n" + "st1 { v17.h }[6], [x23]\n" + "st1 { v21.h }[6], [x22]\n" + "st1 { v25.h }[6], [x21]\n" + "b 243f\n" + "236:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x11, #0, 243f\n" + "st1 { v9.h }[4], [x28]\n" + "st1 { v13.h }[4], [x24]\n" + "st1 { v17.h }[4], [x23]\n" + "st1 { v21.h }[4], [x22]\n" + "st1 { v25.h }[4], [x21]\n" + "b 243f\n" + "237:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x11, #1, 238f\n" + "str s9, [x28], #0x4\n" + "str s13, [x24], #0x4\n" + "str s17, [x23], #0x4\n" + "str s21, [x22], #0x4\n" + "str s25, [x21], #0x4\n" + "tbz x11, #0, 243f\n" + "st1 { v9.h }[2], [x28]\n" + "st1 { v13.h }[2], [x24]\n" + "st1 { v17.h }[2], [x23]\n" + "st1 { v21.h }[2], [x22]\n" + "st1 { v25.h }[2], [x21]\n" + "b 243f\n" + "238:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x11, #0, 243f\n" + "str h9, [x28, #0x0]\n" + "str h13, [x24, #0x0]\n" + "str h17, [x23, #0x0]\n" + "str h21, [x22, #0x0]\n" + "str h25, [x21, #0x0]\n" + "b 243f\n" + "239:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x11, #2, 241f\n" + "str d8, [x28], #0x8\n" + "str d12, [x24], #0x8\n" + "str 
d16, [x23], #0x8\n" + "str d20, [x22], #0x8\n" + "str d24, [x21], #0x8\n" + "tbz x11, #1, 240f\n" + "st1 { v8.s }[2], [x28], #0x4\n" + "st1 { v12.s }[2], [x24], #0x4\n" + "st1 { v16.s }[2], [x23], #0x4\n" + "st1 { v20.s }[2], [x22], #0x4\n" + "st1 { v24.s }[2], [x21], #0x4\n" + "tbz x11, #0, 243f\n" + "st1 { v8.h }[6], [x28]\n" + "st1 { v12.h }[6], [x24]\n" + "st1 { v16.h }[6], [x23]\n" + "st1 { v20.h }[6], [x22]\n" + "st1 { v24.h }[6], [x21]\n" + "b 243f\n" + "240:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x11, #0, 243f\n" + "st1 { v8.h }[4], [x28]\n" + "st1 { v12.h }[4], [x24]\n" + "st1 { v16.h }[4], [x23]\n" + "st1 { v20.h }[4], [x22]\n" + "st1 { v24.h }[4], [x21]\n" + "b 243f\n" + "241:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x11, #1, 242f\n" + "str s8, [x28], #0x4\n" + "str s12, [x24], #0x4\n" + "str s16, [x23], #0x4\n" + "str s20, [x22], #0x4\n" + "str s24, [x21], #0x4\n" + "tbz x11, #0, 243f\n" + "st1 { v8.h }[2], [x28]\n" + "st1 { v12.h }[2], [x24]\n" + "st1 { v16.h }[2], [x23]\n" + "st1 { v20.h }[2], [x22]\n" + "st1 { v24.h }[2], [x21]\n" + "b 243f\n" + "242:" // Height 5: Partial direct writeback: partial_1_0 + "str h8, [x28, #0x0]\n" + "str h12, [x24, #0x0]\n" + "str h16, [x23, #0x0]\n" + "str h20, [x22, #0x0]\n" + "str h24, [x21, #0x0]\n" + "243:" // Height 5: Partial direct writeback: Done + "b 245f\n" + "244:" // Height 5: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q12, [x24, #0x0]\n" + "str q13, [x24, #0x10]\n" + "str q14, [x24, #0x20]\n" + "str q15, [x24, #0x30]\n" + "str q16, [x23, #0x0]\n" + "str q17, [x23, #0x10]\n" + "str q18, [x23, #0x20]\n" + "str q19, [x23, #0x30]\n" + "str q20, [x22, #0x0]\n" + "str q21, [x22, #0x10]\n" + "str q22, [x22, #0x20]\n" + "str q23, [x22, #0x30]\n" + "str q24, [x21, #0x0]\n" + "str q25, [x21, #0x10]\n" + "str q26, [x21, #0x20]\n" + "str q27, [x21, #0x30]\n" + "245:" 
// Height 5: Writeback done + "subs x11, x11, #0x20\n" + "bgt 198b\n" + "b 296f\n" + "246:" // Height 6 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x19, #0xc\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "247:" // Height 6: Column loop + "cbz x9, 248f\n" + "ldr q8, [x9, #0x0]\n" "mov v12.16b, v8.16b\n" - "ldr q9, [x14, #0x10]\n" + "ldr q9, [x9, #0x10]\n" "mov v16.16b, v8.16b\n" - "ldr q10, [x14, #0x20]\n" + "ldr q10, [x9, #0x20]\n" "mov v20.16b, v8.16b\n" - "ldr q11, [x14, #0x30]\n" + "ldr q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" "mov v24.16b, v8.16b\n" - "add x14, x14, #0x40\n" "mov v28.16b, v8.16b\n" "mov v13.16b, v9.16b\n" - "mov v17.16b, v9.16b\n" "mov v14.16b, v10.16b\n" "mov v15.16b, v11.16b\n" + "mov v17.16b, v9.16b\n" "mov v18.16b, v10.16b\n" "mov v19.16b, v11.16b\n" "mov v21.16b, v9.16b\n" @@ -4122,314 +4051,315 @@ void a64_hybrid_fp16_mla_6x32 ( "mov v29.16b, v9.16b\n" "mov v30.16b, v10.16b\n" "mov v31.16b, v11.16b\n" - "b 273f\n" - "254:" // Height 6: no bias - "tbz %x[flags], #0, 272f\n" - "cmp x16, #0x20\n" - "bge 271f\n" - "tbz x16, #4, 262f\n" - "ld1 { v8.8h }, [x13], #0x10\n" - "ld1 { v12.8h }, [x9], #0x10\n" - "ld1 { v16.8h }, [x27], #0x10\n" - "ld1 { v20.8h }, [x25], #0x10\n" - "ld1 { v24.8h }, [x23], #0x10\n" - "ld1 { v28.8h }, [x21], #0x10\n" - "ld1 { v9.8h }, [x13], #0x10\n" - "ld1 { v13.8h }, [x9], #0x10\n" - "ld1 { v17.8h }, [x27], #0x10\n" - "ld1 { v21.8h }, [x25], #0x10\n" - "ld1 { v25.8h }, [x23], #0x10\n" - "ld1 { v29.8h }, [x21], #0x10\n" - "tbz x16, #3, 258f\n" - "ld1 { v10.8h }, [x13], #0x10\n" - "ld1 { v14.8h }, [x9], #0x10\n" - "ld1 { v18.8h }, [x27], #0x10\n" - "ld1 { v22.8h }, [x25], #0x10\n" - "ld1 { v26.8h }, [x23], #0x10\n" - "ld1 { v30.8h }, [x21], #0x10\n" - "tbz x16, #2, 256f\n" - "ldr d11, [x13], #0x8\n" - "ldr d15, [x9], #0x8\n" - "ldr d19, 
[x27], #0x8\n" - "ldr d23, [x25], #0x8\n" - "ldr d27, [x23], #0x8\n" - "ldr d31, [x21], #0x8\n" - "tbz x16, #1, 255f\n" - "ld1 { v11.s }[2], [x13], #0x4\n" - "ld1 { v15.s }[2], [x9], #0x4\n" - "ld1 { v19.s }[2], [x27], #0x4\n" - "ld1 { v23.s }[2], [x25], #0x4\n" - "ld1 { v27.s }[2], [x23], #0x4\n" - "ld1 { v31.s }[2], [x21], #0x4\n" + "b 267f\n" + "248:" // Height 6: no bias + "tbz %x[flags], #0, 266f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x20\n" + "add x24, x28, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "add x21, x22, x19, LSL #1\n" + "add x20, x21, x19, LSL #1\n" + "bge 265f\n" + "tbz x11, #4, 256f\n" + "ld1 { v8.8h }, [x28], #0x10\n" + "ld1 { v12.8h }, [x24], #0x10\n" + "ld1 { v16.8h }, [x23], #0x10\n" + "ld1 { v20.8h }, [x22], #0x10\n" + "ld1 { v24.8h }, [x21], #0x10\n" + "ld1 { v28.8h }, [x20], #0x10\n" + "ld1 { v9.8h }, [x28], #0x10\n" + "ld1 { v13.8h }, [x24], #0x10\n" + "ld1 { v17.8h }, [x23], #0x10\n" + "ld1 { v21.8h }, [x22], #0x10\n" + "ld1 { v25.8h }, [x21], #0x10\n" + "ld1 { v29.8h }, [x20], #0x10\n" + "tbz x11, #3, 252f\n" + "ld1 { v10.8h }, [x28], #0x10\n" + "ld1 { v14.8h }, [x24], #0x10\n" + "ld1 { v18.8h }, [x23], #0x10\n" + "ld1 { v22.8h }, [x22], #0x10\n" + "ld1 { v26.8h }, [x21], #0x10\n" + "ld1 { v30.8h }, [x20], #0x10\n" + "tbz x11, #2, 250f\n" + "ldr d11, [x28], #0x8\n" + "ldr d15, [x24], #0x8\n" + "ldr d19, [x23], #0x8\n" + "ldr d23, [x22], #0x8\n" + "ldr d27, [x21], #0x8\n" + "ldr d31, [x20], #0x8\n" + "tbz x11, #1, 249f\n" + "ld1 { v11.s }[2], [x28], #0x4\n" "mov x19, #0x3c\n" - "tbz x16, #0, 270f\n" - "ld1 { v11.h }[6], [x13]\n" - "ld1 { v15.h }[6], [x9]\n" - "ld1 { v19.h }[6], [x27]\n" - "ld1 { v23.h }[6], [x25]\n" - "ld1 { v27.h }[6], [x23]\n" - "ld1 { v31.h }[6], [x21]\n" - "b 270f\n" - "255:" // Height 6: Partial accumulate: partial_1_28 + "ld1 { v15.s }[2], [x24], #0x4\n" + "ld1 { v19.s }[2], [x23], #0x4\n" + "ld1 { v23.s }[2], [x22], #0x4\n" + "ld1 { v27.s 
}[2], [x21], #0x4\n" + "ld1 { v31.s }[2], [x20], #0x4\n" + "tbz x11, #0, 264f\n" + "ld1 { v11.h }[6], [x28]\n" + "ld1 { v15.h }[6], [x24]\n" + "ld1 { v19.h }[6], [x23]\n" + "ld1 { v23.h }[6], [x22]\n" + "ld1 { v27.h }[6], [x21]\n" + "ld1 { v31.h }[6], [x20]\n" + "b 264f\n" + "249:" // Height 6: Partial accumulate: partial_1_28 "mov x19, #0x38\n" - "tbz x16, #0, 270f\n" - "ld1 { v11.h }[4], [x13]\n" - "ld1 { v15.h }[4], [x9]\n" - "ld1 { v19.h }[4], [x27]\n" - "ld1 { v23.h }[4], [x25]\n" - "ld1 { v27.h }[4], [x23]\n" - "ld1 { v31.h }[4], [x21]\n" - "b 270f\n" - "256:" // Height 6: Partial accumulate: partial_2_24 - "tbz x16, #1, 257f\n" - "ldr s11, [x13], #0x4\n" - "ldr s15, [x9], #0x4\n" - "ldr s19, [x27], #0x4\n" - "ldr s23, [x25], #0x4\n" - "ldr s27, [x23], #0x4\n" - "ldr s31, [x21], #0x4\n" + "tbz x11, #0, 264f\n" + "ld1 { v11.h }[4], [x28]\n" + "ld1 { v15.h }[4], [x24]\n" + "ld1 { v19.h }[4], [x23]\n" + "ld1 { v23.h }[4], [x22]\n" + "ld1 { v27.h }[4], [x21]\n" + "ld1 { v31.h }[4], [x20]\n" + "b 264f\n" + "250:" // Height 6: Partial accumulate: partial_2_24 + "tbz x11, #1, 251f\n" + "ldr s11, [x28], #0x4\n" + "ldr s15, [x24], #0x4\n" "mov x19, #0x34\n" - "tbz x16, #0, 270f\n" - "ld1 { v11.h }[2], [x13]\n" - "ld1 { v15.h }[2], [x9]\n" - "ld1 { v19.h }[2], [x27]\n" - "ld1 { v23.h }[2], [x25]\n" - "ld1 { v27.h }[2], [x23]\n" - "ld1 { v31.h }[2], [x21]\n" - "b 270f\n" - "257:" // Height 6: Partial accumulate: partial_1_24 + "ldr s19, [x23], #0x4\n" + "ldr s23, [x22], #0x4\n" + "ldr s27, [x21], #0x4\n" + "ldr s31, [x20], #0x4\n" + "tbz x11, #0, 264f\n" + "ld1 { v11.h }[2], [x28]\n" + "ld1 { v15.h }[2], [x24]\n" + "ld1 { v19.h }[2], [x23]\n" + "ld1 { v23.h }[2], [x22]\n" + "ld1 { v27.h }[2], [x21]\n" + "ld1 { v31.h }[2], [x20]\n" + "b 264f\n" + "251:" // Height 6: Partial accumulate: partial_1_24 "mov x19, #0x30\n" - "tbz x16, #0, 270f\n" - "ldr h11, [x13, #0x0]\n" - "ldr h15, [x9, #0x0]\n" - "ldr h19, [x27, #0x0]\n" - "ldr h23, [x25, #0x0]\n" - "ldr h27, [x23, 
#0x0]\n" - "ldr h31, [x21, #0x0]\n" - "b 270f\n" - "258:" // Height 6: Partial accumulate: partial_4_16 - "tbz x16, #2, 260f\n" - "ldr d10, [x13], #0x8\n" - "ldr d14, [x9], #0x8\n" - "ldr d18, [x27], #0x8\n" - "ldr d22, [x25], #0x8\n" - "ldr d26, [x23], #0x8\n" - "ldr d30, [x21], #0x8\n" - "tbz x16, #1, 259f\n" - "ld1 { v10.s }[2], [x13], #0x4\n" - "ld1 { v14.s }[2], [x9], #0x4\n" - "ld1 { v18.s }[2], [x27], #0x4\n" - "ld1 { v22.s }[2], [x25], #0x4\n" - "ld1 { v26.s }[2], [x23], #0x4\n" - "ld1 { v30.s }[2], [x21], #0x4\n" + "tbz x11, #0, 264f\n" + "ldr h11, [x28, #0x0]\n" + "ldr h15, [x24, #0x0]\n" + "ldr h19, [x23, #0x0]\n" + "ldr h23, [x22, #0x0]\n" + "ldr h27, [x21, #0x0]\n" + "ldr h31, [x20, #0x0]\n" + "b 264f\n" + "252:" // Height 6: Partial accumulate: partial_4_16 + "tbz x11, #2, 254f\n" + "ldr d10, [x28], #0x8\n" + "ldr d14, [x24], #0x8\n" + "ldr d18, [x23], #0x8\n" + "ldr d22, [x22], #0x8\n" + "ldr d26, [x21], #0x8\n" + "ldr d30, [x20], #0x8\n" + "tbz x11, #1, 253f\n" + "ld1 { v10.s }[2], [x28], #0x4\n" "mov x19, #0x2c\n" - "tbz x16, #0, 270f\n" - "ld1 { v10.h }[6], [x13]\n" - "ld1 { v14.h }[6], [x9]\n" - "ld1 { v18.h }[6], [x27]\n" - "ld1 { v22.h }[6], [x25]\n" - "ld1 { v26.h }[6], [x23]\n" - "ld1 { v30.h }[6], [x21]\n" - "b 270f\n" - "259:" // Height 6: Partial accumulate: partial_1_20 + "ld1 { v14.s }[2], [x24], #0x4\n" + "ld1 { v18.s }[2], [x23], #0x4\n" + "ld1 { v22.s }[2], [x22], #0x4\n" + "ld1 { v26.s }[2], [x21], #0x4\n" + "ld1 { v30.s }[2], [x20], #0x4\n" + "tbz x11, #0, 264f\n" + "ld1 { v10.h }[6], [x28]\n" + "ld1 { v14.h }[6], [x24]\n" + "ld1 { v18.h }[6], [x23]\n" + "ld1 { v22.h }[6], [x22]\n" + "ld1 { v26.h }[6], [x21]\n" + "ld1 { v30.h }[6], [x20]\n" + "b 264f\n" + "253:" // Height 6: Partial accumulate: partial_1_20 "mov x19, #0x28\n" - "tbz x16, #0, 270f\n" - "ld1 { v10.h }[4], [x13]\n" - "ld1 { v14.h }[4], [x9]\n" - "ld1 { v18.h }[4], [x27]\n" - "ld1 { v22.h }[4], [x25]\n" - "ld1 { v26.h }[4], [x23]\n" - "ld1 { v30.h }[4], [x21]\n" - "b 
270f\n" - "260:" // Height 6: Partial accumulate: partial_2_16 - "tbz x16, #1, 261f\n" - "ldr s10, [x13], #0x4\n" - "ldr s14, [x9], #0x4\n" - "ldr s18, [x27], #0x4\n" - "ldr s22, [x25], #0x4\n" - "ldr s26, [x23], #0x4\n" - "ldr s30, [x21], #0x4\n" + "tbz x11, #0, 264f\n" + "ld1 { v10.h }[4], [x28]\n" + "ld1 { v14.h }[4], [x24]\n" + "ld1 { v18.h }[4], [x23]\n" + "ld1 { v22.h }[4], [x22]\n" + "ld1 { v26.h }[4], [x21]\n" + "ld1 { v30.h }[4], [x20]\n" + "b 264f\n" + "254:" // Height 6: Partial accumulate: partial_2_16 + "tbz x11, #1, 255f\n" + "ldr s10, [x28], #0x4\n" + "ldr s14, [x24], #0x4\n" "mov x19, #0x24\n" - "tbz x16, #0, 270f\n" - "ld1 { v10.h }[2], [x13]\n" - "ld1 { v14.h }[2], [x9]\n" - "ld1 { v18.h }[2], [x27]\n" - "ld1 { v22.h }[2], [x25]\n" - "ld1 { v26.h }[2], [x23]\n" - "ld1 { v30.h }[2], [x21]\n" - "b 270f\n" - "261:" // Height 6: Partial accumulate: partial_1_16 + "ldr s18, [x23], #0x4\n" + "ldr s22, [x22], #0x4\n" + "ldr s26, [x21], #0x4\n" + "ldr s30, [x20], #0x4\n" + "tbz x11, #0, 264f\n" + "ld1 { v10.h }[2], [x28]\n" + "ld1 { v14.h }[2], [x24]\n" + "ld1 { v18.h }[2], [x23]\n" + "ld1 { v22.h }[2], [x22]\n" + "ld1 { v26.h }[2], [x21]\n" + "ld1 { v30.h }[2], [x20]\n" + "b 264f\n" + "255:" // Height 6: Partial accumulate: partial_1_16 "mov x19, #0x20\n" - "tbz x16, #0, 270f\n" - "ldr h10, [x13, #0x0]\n" - "ldr h14, [x9, #0x0]\n" - "ldr h18, [x27, #0x0]\n" - "ldr h22, [x25, #0x0]\n" - "ldr h26, [x23, #0x0]\n" - "ldr h30, [x21, #0x0]\n" - "b 270f\n" - "262:" // Height 6: Partial accumulate: partial_8_0 - "tbz x16, #3, 266f\n" - "ld1 { v8.8h }, [x13], #0x10\n" - "ld1 { v12.8h }, [x9], #0x10\n" - "ld1 { v16.8h }, [x27], #0x10\n" - "ld1 { v20.8h }, [x25], #0x10\n" - "ld1 { v24.8h }, [x23], #0x10\n" - "ld1 { v28.8h }, [x21], #0x10\n" - "tbz x16, #2, 264f\n" - "ldr d9, [x13], #0x8\n" - "ldr d13, [x9], #0x8\n" - "ldr d17, [x27], #0x8\n" - "ldr d21, [x25], #0x8\n" - "ldr d25, [x23], #0x8\n" - "ldr d29, [x21], #0x8\n" - "tbz x16, #1, 263f\n" - "ld1 { v9.s }[2], 
[x13], #0x4\n" - "ld1 { v13.s }[2], [x9], #0x4\n" - "ld1 { v17.s }[2], [x27], #0x4\n" - "ld1 { v21.s }[2], [x25], #0x4\n" - "ld1 { v25.s }[2], [x23], #0x4\n" - "ld1 { v29.s }[2], [x21], #0x4\n" + "tbz x11, #0, 264f\n" + "ldr h10, [x28, #0x0]\n" + "ldr h14, [x24, #0x0]\n" + "ldr h18, [x23, #0x0]\n" + "ldr h22, [x22, #0x0]\n" + "ldr h26, [x21, #0x0]\n" + "ldr h30, [x20, #0x0]\n" + "b 264f\n" + "256:" // Height 6: Partial accumulate: partial_8_0 + "tbz x11, #3, 260f\n" + "ld1 { v8.8h }, [x28], #0x10\n" + "ld1 { v12.8h }, [x24], #0x10\n" + "ld1 { v16.8h }, [x23], #0x10\n" + "ld1 { v20.8h }, [x22], #0x10\n" + "ld1 { v24.8h }, [x21], #0x10\n" + "ld1 { v28.8h }, [x20], #0x10\n" + "tbz x11, #2, 258f\n" + "ldr d9, [x28], #0x8\n" + "ldr d13, [x24], #0x8\n" + "ldr d17, [x23], #0x8\n" + "ldr d21, [x22], #0x8\n" + "ldr d25, [x21], #0x8\n" + "ldr d29, [x20], #0x8\n" + "tbz x11, #1, 257f\n" + "ld1 { v9.s }[2], [x28], #0x4\n" "mov x19, #0x1c\n" - "tbz x16, #0, 270f\n" - "ld1 { v9.h }[6], [x13]\n" - "ld1 { v13.h }[6], [x9]\n" - "ld1 { v17.h }[6], [x27]\n" - "ld1 { v21.h }[6], [x25]\n" - "ld1 { v25.h }[6], [x23]\n" - "ld1 { v29.h }[6], [x21]\n" - "b 270f\n" - "263:" // Height 6: Partial accumulate: partial_1_12 + "ld1 { v13.s }[2], [x24], #0x4\n" + "ld1 { v17.s }[2], [x23], #0x4\n" + "ld1 { v21.s }[2], [x22], #0x4\n" + "ld1 { v25.s }[2], [x21], #0x4\n" + "ld1 { v29.s }[2], [x20], #0x4\n" + "tbz x11, #0, 264f\n" + "ld1 { v9.h }[6], [x28]\n" + "ld1 { v13.h }[6], [x24]\n" + "ld1 { v17.h }[6], [x23]\n" + "ld1 { v21.h }[6], [x22]\n" + "ld1 { v25.h }[6], [x21]\n" + "ld1 { v29.h }[6], [x20]\n" + "b 264f\n" + "257:" // Height 6: Partial accumulate: partial_1_12 "mov x19, #0x18\n" - "tbz x16, #0, 270f\n" - "ld1 { v9.h }[4], [x13]\n" - "ld1 { v13.h }[4], [x9]\n" - "ld1 { v17.h }[4], [x27]\n" - "ld1 { v21.h }[4], [x25]\n" - "ld1 { v25.h }[4], [x23]\n" - "ld1 { v29.h }[4], [x21]\n" - "b 270f\n" - "264:" // Height 6: Partial accumulate: partial_2_8 - "tbz x16, #1, 265f\n" - "ldr s9, [x13], 
#0x4\n" - "ldr s13, [x9], #0x4\n" - "ldr s17, [x27], #0x4\n" - "ldr s21, [x25], #0x4\n" - "ldr s25, [x23], #0x4\n" - "ldr s29, [x21], #0x4\n" + "tbz x11, #0, 264f\n" + "ld1 { v9.h }[4], [x28]\n" + "ld1 { v13.h }[4], [x24]\n" + "ld1 { v17.h }[4], [x23]\n" + "ld1 { v21.h }[4], [x22]\n" + "ld1 { v25.h }[4], [x21]\n" + "ld1 { v29.h }[4], [x20]\n" + "b 264f\n" + "258:" // Height 6: Partial accumulate: partial_2_8 + "tbz x11, #1, 259f\n" + "ldr s9, [x28], #0x4\n" + "ldr s13, [x24], #0x4\n" "mov x19, #0x14\n" - "tbz x16, #0, 270f\n" - "ld1 { v9.h }[2], [x13]\n" - "ld1 { v13.h }[2], [x9]\n" - "ld1 { v17.h }[2], [x27]\n" - "ld1 { v21.h }[2], [x25]\n" - "ld1 { v25.h }[2], [x23]\n" - "ld1 { v29.h }[2], [x21]\n" - "b 270f\n" - "265:" // Height 6: Partial accumulate: partial_1_8 + "ldr s17, [x23], #0x4\n" + "ldr s21, [x22], #0x4\n" + "ldr s25, [x21], #0x4\n" + "ldr s29, [x20], #0x4\n" + "tbz x11, #0, 264f\n" + "ld1 { v9.h }[2], [x28]\n" + "ld1 { v13.h }[2], [x24]\n" + "ld1 { v17.h }[2], [x23]\n" + "ld1 { v21.h }[2], [x22]\n" + "ld1 { v25.h }[2], [x21]\n" + "ld1 { v29.h }[2], [x20]\n" + "b 264f\n" + "259:" // Height 6: Partial accumulate: partial_1_8 "mov x19, #0x10\n" - "tbz x16, #0, 270f\n" - "ldr h9, [x13, #0x0]\n" - "ldr h13, [x9, #0x0]\n" - "ldr h17, [x27, #0x0]\n" - "ldr h21, [x25, #0x0]\n" - "ldr h25, [x23, #0x0]\n" - "ldr h29, [x21, #0x0]\n" - "b 270f\n" - "266:" // Height 6: Partial accumulate: partial_4_0 - "tbz x16, #2, 268f\n" - "ldr d8, [x13], #0x8\n" - "ldr d12, [x9], #0x8\n" - "ldr d16, [x27], #0x8\n" - "ldr d20, [x25], #0x8\n" - "ldr d24, [x23], #0x8\n" - "ldr d28, [x21], #0x8\n" - "tbz x16, #1, 267f\n" - "ld1 { v8.s }[2], [x13], #0x4\n" - "ld1 { v12.s }[2], [x9], #0x4\n" - "ld1 { v16.s }[2], [x27], #0x4\n" - "ld1 { v20.s }[2], [x25], #0x4\n" - "ld1 { v24.s }[2], [x23], #0x4\n" - "ld1 { v28.s }[2], [x21], #0x4\n" + "tbz x11, #0, 264f\n" + "ldr h9, [x28, #0x0]\n" + "ldr h13, [x24, #0x0]\n" + "ldr h17, [x23, #0x0]\n" + "ldr h21, [x22, #0x0]\n" + "ldr h25, [x21, 
#0x0]\n" + "ldr h29, [x20, #0x0]\n" + "b 264f\n" + "260:" // Height 6: Partial accumulate: partial_4_0 + "tbz x11, #2, 262f\n" + "ldr d8, [x28], #0x8\n" + "ldr d12, [x24], #0x8\n" + "ldr d16, [x23], #0x8\n" + "ldr d20, [x22], #0x8\n" + "ldr d24, [x21], #0x8\n" + "ldr d28, [x20], #0x8\n" + "tbz x11, #1, 261f\n" + "ld1 { v8.s }[2], [x28], #0x4\n" "mov x19, #0xc\n" - "tbz x16, #0, 270f\n" - "ld1 { v8.h }[6], [x13]\n" - "ld1 { v12.h }[6], [x9]\n" - "ld1 { v16.h }[6], [x27]\n" - "ld1 { v20.h }[6], [x25]\n" - "ld1 { v24.h }[6], [x23]\n" - "ld1 { v28.h }[6], [x21]\n" - "b 270f\n" - "267:" // Height 6: Partial accumulate: partial_1_4 + "ld1 { v12.s }[2], [x24], #0x4\n" + "ld1 { v16.s }[2], [x23], #0x4\n" + "ld1 { v20.s }[2], [x22], #0x4\n" + "ld1 { v24.s }[2], [x21], #0x4\n" + "ld1 { v28.s }[2], [x20], #0x4\n" + "tbz x11, #0, 264f\n" + "ld1 { v8.h }[6], [x28]\n" + "ld1 { v12.h }[6], [x24]\n" + "ld1 { v16.h }[6], [x23]\n" + "ld1 { v20.h }[6], [x22]\n" + "ld1 { v24.h }[6], [x21]\n" + "ld1 { v28.h }[6], [x20]\n" + "b 264f\n" + "261:" // Height 6: Partial accumulate: partial_1_4 "mov x19, #0x8\n" - "tbz x16, #0, 270f\n" - "ld1 { v8.h }[4], [x13]\n" - "ld1 { v12.h }[4], [x9]\n" - "ld1 { v16.h }[4], [x27]\n" - "ld1 { v20.h }[4], [x25]\n" - "ld1 { v24.h }[4], [x23]\n" - "ld1 { v28.h }[4], [x21]\n" - "b 270f\n" - "268:" // Height 6: Partial accumulate: partial_2_0 - "tbz x16, #1, 269f\n" - "ldr s8, [x13], #0x4\n" - "ldr s12, [x9], #0x4\n" - "ldr s16, [x27], #0x4\n" - "ldr s20, [x25], #0x4\n" - "ldr s24, [x23], #0x4\n" - "ldr s28, [x21], #0x4\n" + "tbz x11, #0, 264f\n" + "ld1 { v8.h }[4], [x28]\n" + "ld1 { v12.h }[4], [x24]\n" + "ld1 { v16.h }[4], [x23]\n" + "ld1 { v20.h }[4], [x22]\n" + "ld1 { v24.h }[4], [x21]\n" + "ld1 { v28.h }[4], [x20]\n" + "b 264f\n" + "262:" // Height 6: Partial accumulate: partial_2_0 + "tbz x11, #1, 263f\n" + "ldr s8, [x28], #0x4\n" + "ldr s12, [x24], #0x4\n" "mov x19, #0x4\n" - "tbz x16, #0, 270f\n" - "ld1 { v8.h }[2], [x13]\n" - "ld1 { v12.h }[2], 
[x9]\n" - "ld1 { v16.h }[2], [x27]\n" - "ld1 { v20.h }[2], [x25]\n" - "ld1 { v24.h }[2], [x23]\n" - "ld1 { v28.h }[2], [x21]\n" - "b 270f\n" - "269:" // Height 6: Partial accumulate: partial_1_0 + "ldr s16, [x23], #0x4\n" + "ldr s20, [x22], #0x4\n" + "ldr s24, [x21], #0x4\n" + "ldr s28, [x20], #0x4\n" + "tbz x11, #0, 264f\n" + "ld1 { v8.h }[2], [x28]\n" + "ld1 { v12.h }[2], [x24]\n" + "ld1 { v16.h }[2], [x23]\n" + "ld1 { v20.h }[2], [x22]\n" + "ld1 { v24.h }[2], [x21]\n" + "ld1 { v28.h }[2], [x20]\n" + "b 264f\n" + "263:" // Height 6: Partial accumulate: partial_1_0 + "ldr h8, [x28, #0x0]\n" "mov x19, #0x0\n" - "ldr h8, [x13, #0x0]\n" - "ldr h12, [x9, #0x0]\n" - "ldr h16, [x27, #0x0]\n" - "ldr h20, [x25, #0x0]\n" - "ldr h24, [x23, #0x0]\n" - "ldr h28, [x21, #0x0]\n" - "270:" // Height 6: Partial accumulate: Done - "sub x13, x13, x19\n" - "sub x9, x9, x19\n" - "sub x27, x27, x19\n" - "sub x25, x25, x19\n" - "sub x23, x23, x19\n" - "sub x21, x21, x19\n" - "b 273f\n" - "271:" // Height 6: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "ldr q12, [x9, #0x0]\n" - "ldr q13, [x9, #0x10]\n" - "ldr q14, [x9, #0x20]\n" - "ldr q15, [x9, #0x30]\n" - "ldr q16, [x27, #0x0]\n" - "ldr q17, [x27, #0x10]\n" - "ldr q18, [x27, #0x20]\n" - "ldr q19, [x27, #0x30]\n" - "ldr q20, [x25, #0x0]\n" - "ldr q21, [x25, #0x10]\n" - "ldr q22, [x25, #0x20]\n" - "ldr q23, [x25, #0x30]\n" - "ldr q24, [x23, #0x0]\n" - "ldr q25, [x23, #0x10]\n" - "ldr q26, [x23, #0x20]\n" - "ldr q27, [x23, #0x30]\n" - "ldr q28, [x21, #0x0]\n" - "ldr q29, [x21, #0x10]\n" - "ldr q30, [x21, #0x20]\n" - "ldr q31, [x21, #0x30]\n" - "b 273f\n" - "272:" // Height 6: no accumulate + "ldr h12, [x24, #0x0]\n" + "ldr h16, [x23, #0x0]\n" + "ldr h20, [x22, #0x0]\n" + "ldr h24, [x21, #0x0]\n" + "ldr h28, [x20, #0x0]\n" + "264:" // Height 6: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 267f\n" + "265:" // Height 6: full accumulate + "ldr q8, [x28, 
#0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "ldr q12, [x24, #0x0]\n" + "ldr q13, [x24, #0x10]\n" + "ldr q14, [x24, #0x20]\n" + "ldr q15, [x24, #0x30]\n" + "ldr q16, [x23, #0x0]\n" + "ldr q17, [x23, #0x10]\n" + "ldr q18, [x23, #0x20]\n" + "ldr q19, [x23, #0x30]\n" + "ldr q20, [x22, #0x0]\n" + "ldr q21, [x22, #0x10]\n" + "ldr q22, [x22, #0x20]\n" + "ldr q23, [x22, #0x30]\n" + "ldr q24, [x21, #0x0]\n" + "ldr q25, [x21, #0x10]\n" + "ldr q26, [x21, #0x20]\n" + "ldr q27, [x21, #0x30]\n" + "ldr q28, [x20, #0x0]\n" + "ldr q29, [x20, #0x10]\n" + "ldr q30, [x20, #0x20]\n" + "ldr q31, [x20, #0x30]\n" + "b 267f\n" + "266:" // Height 6: no accumulate "movi v8.16b, #0x0\n" "movi v9.16b, #0x0\n" "movi v10.16b, #0x0\n" @@ -4454,523 +4384,523 @@ void a64_hybrid_fp16_mla_6x32 ( "movi v29.16b, #0x0\n" "movi v30.16b, #0x0\n" "movi v31.16b, #0x0\n" - "273:" // Height 6: setup done - "mov x12, #0x0\n" - "274:" // Height 6: String loop + "267:" // Height 6: setup done + "mov x27, #0x0\n" + "268:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 275f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 269f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "ldr x22, [x20, #0x20]\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" "ldr x20, [x20, #0x28]\n" - "cbnz x12, 276f\n" + "cbnz x27, 270f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #1\n" - "add x28, x28, x19, LSL #1\n" - "add x26, x26, x19, LSL #1\n" + "add x25, x25, x19, LSL #1\n" "add x24, x24, x19, LSL #1\n" + "add 
x23, x23, x19, LSL #1\n" "add x22, x22, x19, LSL #1\n" + "add x21, x21, x19, LSL #1\n" "add x20, x20, x19, LSL #1\n" - "b 276f\n" - "275:" // Height 6: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #1\n" - "add x26, x28, x19, LSL #1\n" - "add x24, x26, x19, LSL #1\n" - "add x22, x24, x19, LSL #1\n" - "add x20, x22, x19, LSL #1\n" - "276:" // Height 6: input setup done - "cmp x11, #0x8\n" - "blt 279f\n" - "cmp x11, #0x10\n" - "blt 278f\n" - "277:" // Height 6: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q4, [x22, #0x0]\n" + "b 270f\n" + "269:" // Height 6: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "add x21, x22, x19, LSL #1\n" + "add x20, x21, x19, LSL #1\n" + "270:" // Height 6: input setup done + "cmp x26, #0x8\n" + "blt 273f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x10\n" + "ldr q2, [x23, #0x0]\n" + "ldr q3, [x22, #0x0]\n" + "ldr q4, [x21, #0x0]\n" "ldr q5, [x20, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "ldr q6, [x10, #0x0]\n" + "blt 272f\n" + "271:" // Height 6: Multiply loop: Main loop head "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" + "add x25, x25, #0x10\n" "fmla v12.8h, v6.8h, v1.h[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" "fmla v16.8h, v6.8h, v2.h[0]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x23, x23, #0x10\n" "fmla v20.8h, v6.8h, v3.h[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x23, #0x80]\n" + "add x22, x22, #0x10\n" "fmla v24.8h, v6.8h, v4.h[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "add x24, x24, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "add x21, x21, #0x10\n" "fmla v28.8h, v6.8h, v5.h[0]\n" - "prfm 
pldl1keep, [x24, #0x80]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q6, [x10, #0x20]\n" + "add x20, x20, #0x10\n" "fmla v9.8h, v7.8h, v0.h[0]\n" - "add x22, x22, #0x10\n" - "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "sub x26, x26, #0x8\n" "fmla v13.8h, v7.8h, v1.h[0]\n" - "add x20, x20, #0x10\n" - "fmla v17.8h, v7.8h, v2.h[0]\n" "prfm pldl1keep, [x20, #0x80]\n" - "sub x11, x11, #0x8\n" + "cmp x26, #0x10\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" "fmla v21.8h, v7.8h, v3.h[0]\n" - "cmp x11, #0x10\n" "fmla v25.8h, v7.8h, v4.h[0]\n" "fmla v29.8h, v7.8h, v5.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" "fmla v10.8h, v6.8h, v0.h[0]\n" "fmla v14.8h, v6.8h, v1.h[0]\n" "fmla v18.8h, v6.8h, v2.h[0]\n" "fmla v22.8h, v6.8h, v3.h[0]\n" "fmla v26.8h, v6.8h, v4.h[0]\n" "fmla v30.8h, v6.8h, v5.h[0]\n" - "ldr q6, [x15, #0x40]\n" + "ldr q6, [x10, #0x40]\n" "fmla v11.8h, v7.8h, v0.h[0]\n" "fmla v15.8h, v7.8h, v1.h[0]\n" "fmla v19.8h, v7.8h, v2.h[0]\n" "fmla v23.8h, v7.8h, v3.h[0]\n" "fmla v27.8h, v7.8h, v4.h[0]\n" "fmla v31.8h, v7.8h, v5.h[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" "fmla v8.8h, v6.8h, v0.h[1]\n" "fmla v12.8h, v6.8h, v1.h[1]\n" "fmla v16.8h, v6.8h, v2.h[1]\n" "fmla v20.8h, v6.8h, v3.h[1]\n" "fmla v24.8h, v6.8h, v4.h[1]\n" "fmla v28.8h, v6.8h, v5.h[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" "fmla v9.8h, v7.8h, v0.h[1]\n" "fmla v13.8h, v7.8h, v1.h[1]\n" "fmla v17.8h, v7.8h, v2.h[1]\n" "fmla v21.8h, v7.8h, v3.h[1]\n" "fmla v25.8h, v7.8h, v4.h[1]\n" "fmla v29.8h, v7.8h, v5.h[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" "fmla v10.8h, v6.8h, v0.h[1]\n" "fmla v14.8h, v6.8h, v1.h[1]\n" "fmla v18.8h, v6.8h, v2.h[1]\n" "fmla v22.8h, v6.8h, v3.h[1]\n" "fmla v26.8h, v6.8h, v4.h[1]\n" "fmla v30.8h, v6.8h, v5.h[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" "fmla v11.8h, v7.8h, v0.h[1]\n" "fmla v15.8h, v7.8h, v1.h[1]\n" "fmla v19.8h, v7.8h, v2.h[1]\n" "fmla v23.8h, v7.8h, v3.h[1]\n" "fmla 
v27.8h, v7.8h, v4.h[1]\n" "fmla v31.8h, v7.8h, v5.h[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" "fmla v8.8h, v6.8h, v0.h[2]\n" "fmla v12.8h, v6.8h, v1.h[2]\n" "fmla v16.8h, v6.8h, v2.h[2]\n" "fmla v20.8h, v6.8h, v3.h[2]\n" "fmla v24.8h, v6.8h, v4.h[2]\n" "fmla v28.8h, v6.8h, v5.h[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" "fmla v9.8h, v7.8h, v0.h[2]\n" "fmla v13.8h, v7.8h, v1.h[2]\n" "fmla v17.8h, v7.8h, v2.h[2]\n" "fmla v21.8h, v7.8h, v3.h[2]\n" "fmla v25.8h, v7.8h, v4.h[2]\n" "fmla v29.8h, v7.8h, v5.h[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" "fmla v10.8h, v6.8h, v0.h[2]\n" "fmla v14.8h, v6.8h, v1.h[2]\n" "fmla v18.8h, v6.8h, v2.h[2]\n" "fmla v22.8h, v6.8h, v3.h[2]\n" "fmla v26.8h, v6.8h, v4.h[2]\n" "fmla v30.8h, v6.8h, v5.h[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" "fmla v11.8h, v7.8h, v0.h[2]\n" "fmla v15.8h, v7.8h, v1.h[2]\n" "fmla v19.8h, v7.8h, v2.h[2]\n" "fmla v23.8h, v7.8h, v3.h[2]\n" "fmla v27.8h, v7.8h, v4.h[2]\n" "fmla v31.8h, v7.8h, v5.h[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" "fmla v8.8h, v6.8h, v0.h[3]\n" "fmla v12.8h, v6.8h, v1.h[3]\n" "fmla v16.8h, v6.8h, v2.h[3]\n" "fmla v20.8h, v6.8h, v3.h[3]\n" "fmla v24.8h, v6.8h, v4.h[3]\n" "fmla v28.8h, v6.8h, v5.h[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" "fmla v9.8h, v7.8h, v0.h[3]\n" "fmla v13.8h, v7.8h, v1.h[3]\n" "fmla v17.8h, v7.8h, v2.h[3]\n" "fmla v21.8h, v7.8h, v3.h[3]\n" "fmla v25.8h, v7.8h, v4.h[3]\n" "fmla v29.8h, v7.8h, v5.h[3]\n" - "ldr q7, [x15, #0xf0]\n" + "ldr q7, [x10, #0xf0]\n" "fmla v10.8h, v6.8h, v0.h[3]\n" "fmla v14.8h, v6.8h, v1.h[3]\n" "fmla v18.8h, v6.8h, v2.h[3]\n" "fmla v22.8h, v6.8h, v3.h[3]\n" "fmla v26.8h, v6.8h, v4.h[3]\n" "fmla v30.8h, v6.8h, v5.h[3]\n" - "ldr q6, [x15, #0x100]\n" + "ldr q6, [x10, #0x100]\n" "fmla v11.8h, v7.8h, v0.h[3]\n" "fmla v15.8h, v7.8h, v1.h[3]\n" "fmla v19.8h, v7.8h, v2.h[3]\n" "fmla v23.8h, v7.8h, v3.h[3]\n" "fmla v27.8h, v7.8h, v4.h[3]\n" "fmla 
v31.8h, v7.8h, v5.h[3]\n" - "ldr q7, [x15, #0x110]\n" + "ldr q7, [x10, #0x110]\n" "fmla v8.8h, v6.8h, v0.h[4]\n" "fmla v12.8h, v6.8h, v1.h[4]\n" "fmla v16.8h, v6.8h, v2.h[4]\n" "fmla v20.8h, v6.8h, v3.h[4]\n" "fmla v24.8h, v6.8h, v4.h[4]\n" "fmla v28.8h, v6.8h, v5.h[4]\n" - "ldr q6, [x15, #0x120]\n" + "ldr q6, [x10, #0x120]\n" "fmla v9.8h, v7.8h, v0.h[4]\n" "fmla v13.8h, v7.8h, v1.h[4]\n" "fmla v17.8h, v7.8h, v2.h[4]\n" "fmla v21.8h, v7.8h, v3.h[4]\n" "fmla v25.8h, v7.8h, v4.h[4]\n" "fmla v29.8h, v7.8h, v5.h[4]\n" - "ldr q7, [x15, #0x130]\n" + "ldr q7, [x10, #0x130]\n" "fmla v10.8h, v6.8h, v0.h[4]\n" "fmla v14.8h, v6.8h, v1.h[4]\n" "fmla v18.8h, v6.8h, v2.h[4]\n" "fmla v22.8h, v6.8h, v3.h[4]\n" "fmla v26.8h, v6.8h, v4.h[4]\n" "fmla v30.8h, v6.8h, v5.h[4]\n" - "ldr q6, [x15, #0x140]\n" + "ldr q6, [x10, #0x140]\n" "fmla v11.8h, v7.8h, v0.h[4]\n" "fmla v15.8h, v7.8h, v1.h[4]\n" "fmla v19.8h, v7.8h, v2.h[4]\n" "fmla v23.8h, v7.8h, v3.h[4]\n" "fmla v27.8h, v7.8h, v4.h[4]\n" "fmla v31.8h, v7.8h, v5.h[4]\n" - "ldr q7, [x15, #0x150]\n" + "ldr q7, [x10, #0x150]\n" "fmla v8.8h, v6.8h, v0.h[5]\n" "fmla v12.8h, v6.8h, v1.h[5]\n" "fmla v16.8h, v6.8h, v2.h[5]\n" "fmla v20.8h, v6.8h, v3.h[5]\n" "fmla v24.8h, v6.8h, v4.h[5]\n" "fmla v28.8h, v6.8h, v5.h[5]\n" - "ldr q6, [x15, #0x160]\n" + "ldr q6, [x10, #0x160]\n" "fmla v9.8h, v7.8h, v0.h[5]\n" "fmla v13.8h, v7.8h, v1.h[5]\n" "fmla v17.8h, v7.8h, v2.h[5]\n" "fmla v21.8h, v7.8h, v3.h[5]\n" "fmla v25.8h, v7.8h, v4.h[5]\n" "fmla v29.8h, v7.8h, v5.h[5]\n" - "ldr q7, [x15, #0x170]\n" + "ldr q7, [x10, #0x170]\n" "fmla v10.8h, v6.8h, v0.h[5]\n" "fmla v14.8h, v6.8h, v1.h[5]\n" "fmla v18.8h, v6.8h, v2.h[5]\n" "fmla v22.8h, v6.8h, v3.h[5]\n" "fmla v26.8h, v6.8h, v4.h[5]\n" "fmla v30.8h, v6.8h, v5.h[5]\n" - "ldr q6, [x15, #0x180]\n" + "ldr q6, [x10, #0x180]\n" "fmla v11.8h, v7.8h, v0.h[5]\n" "fmla v15.8h, v7.8h, v1.h[5]\n" "fmla v19.8h, v7.8h, v2.h[5]\n" "fmla v23.8h, v7.8h, v3.h[5]\n" "fmla v27.8h, v7.8h, v4.h[5]\n" "fmla v31.8h, v7.8h, 
v5.h[5]\n" - "ldr q7, [x15, #0x190]\n" + "ldr q7, [x10, #0x190]\n" "fmla v8.8h, v6.8h, v0.h[6]\n" "fmla v12.8h, v6.8h, v1.h[6]\n" "fmla v16.8h, v6.8h, v2.h[6]\n" "fmla v20.8h, v6.8h, v3.h[6]\n" "fmla v24.8h, v6.8h, v4.h[6]\n" "fmla v28.8h, v6.8h, v5.h[6]\n" - "ldr q6, [x15, #0x1a0]\n" + "ldr q6, [x10, #0x1a0]\n" "fmla v9.8h, v7.8h, v0.h[6]\n" "fmla v13.8h, v7.8h, v1.h[6]\n" "fmla v17.8h, v7.8h, v2.h[6]\n" "fmla v21.8h, v7.8h, v3.h[6]\n" "fmla v25.8h, v7.8h, v4.h[6]\n" "fmla v29.8h, v7.8h, v5.h[6]\n" - "ldr q7, [x15, #0x1b0]\n" + "ldr q7, [x10, #0x1b0]\n" "fmla v10.8h, v6.8h, v0.h[6]\n" "fmla v14.8h, v6.8h, v1.h[6]\n" "fmla v18.8h, v6.8h, v2.h[6]\n" "fmla v22.8h, v6.8h, v3.h[6]\n" "fmla v26.8h, v6.8h, v4.h[6]\n" "fmla v30.8h, v6.8h, v5.h[6]\n" - "ldr q6, [x15, #0x1c0]\n" + "ldr q6, [x10, #0x1c0]\n" "fmla v11.8h, v7.8h, v0.h[6]\n" "fmla v15.8h, v7.8h, v1.h[6]\n" "fmla v19.8h, v7.8h, v2.h[6]\n" "fmla v23.8h, v7.8h, v3.h[6]\n" "fmla v27.8h, v7.8h, v4.h[6]\n" "fmla v31.8h, v7.8h, v5.h[6]\n" - "ldr q7, [x15, #0x1d0]\n" + "ldr q7, [x10, #0x1d0]\n" "fmla v8.8h, v6.8h, v0.h[7]\n" "fmla v12.8h, v6.8h, v1.h[7]\n" "fmla v16.8h, v6.8h, v2.h[7]\n" "fmla v20.8h, v6.8h, v3.h[7]\n" "fmla v24.8h, v6.8h, v4.h[7]\n" "fmla v28.8h, v6.8h, v5.h[7]\n" - "ldr q6, [x15, #0x1e0]\n" + "ldr q6, [x10, #0x1e0]\n" "fmla v9.8h, v7.8h, v0.h[7]\n" "fmla v13.8h, v7.8h, v1.h[7]\n" "fmla v17.8h, v7.8h, v2.h[7]\n" "fmla v21.8h, v7.8h, v3.h[7]\n" "fmla v25.8h, v7.8h, v4.h[7]\n" "fmla v29.8h, v7.8h, v5.h[7]\n" - "ldr q7, [x15, #0x1f0]\n" + "ldr q7, [x10, #0x1f0]\n" + "add x10, x10, #0x200\n" "fmla v10.8h, v6.8h, v0.h[7]\n" - "add x15, x15, #0x200\n" "fmla v14.8h, v6.8h, v1.h[7]\n" "fmla v18.8h, v6.8h, v2.h[7]\n" "fmla v22.8h, v6.8h, v3.h[7]\n" "fmla v26.8h, v6.8h, v4.h[7]\n" "fmla v30.8h, v6.8h, v5.h[7]\n" + "ldr q6, [x10, #0x0]\n" "fmla v11.8h, v7.8h, v0.h[7]\n" + "ldr q0, [x25, #0x0]\n" "fmla v15.8h, v7.8h, v1.h[7]\n" + "ldr q1, [x24, #0x0]\n" "fmla v19.8h, v7.8h, v2.h[7]\n" + "ldr q2, [x23, #0x0]\n" 
"fmla v23.8h, v7.8h, v3.h[7]\n" + "ldr q3, [x22, #0x0]\n" "fmla v27.8h, v7.8h, v4.h[7]\n" + "ldr q4, [x21, #0x0]\n" "fmla v31.8h, v7.8h, v5.h[7]\n" - "bge 277b\n" - "278:" // Height 6: Multiply loop: Single iteration only - "sub x11, x11, #0x8\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q4, [x22, #0x0]\n" "ldr q5, [x20, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "bge 271b\n" + "272:" // Height 6: Multiply loop: Single iteration only "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" + "sub x26, x26, #0x8\n" "fmla v12.8h, v6.8h, v1.h[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "add x25, x25, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" "fmla v16.8h, v6.8h, v2.h[0]\n" - "add x28, x28, #0x10\n" - "fmla v20.8h, v6.8h, v3.h[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" - "fmla v24.8h, v6.8h, v4.h[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" "add x24, x24, #0x10\n" - "fmla v28.8h, v6.8h, v5.h[0]\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" "prfm pldl1keep, [x24, #0x80]\n" - "ldr q6, [x15, #0x20]\n" - "fmla v9.8h, v7.8h, v0.h[0]\n" + "add x23, x23, #0x10\n" + "fmla v24.8h, v6.8h, v4.h[0]\n" + "prfm pldl1keep, [x23, #0x80]\n" "add x22, x22, #0x10\n" + "fmla v28.8h, v6.8h, v5.h[0]\n" "prfm pldl1keep, [x22, #0x80]\n" - "fmla v13.8h, v7.8h, v1.h[0]\n" + "add x21, x21, #0x10\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "ldr q6, [x10, #0x20]\n" "add x20, x20, #0x10\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "prfm pldl1keep, [x21, #0x80]\n" "fmla v17.8h, v7.8h, v2.h[0]\n" "prfm pldl1keep, [x20, #0x80]\n" "fmla v21.8h, v7.8h, v3.h[0]\n" "fmla v25.8h, v7.8h, v4.h[0]\n" "fmla v29.8h, v7.8h, v5.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" "fmla v10.8h, v6.8h, v0.h[0]\n" "fmla v14.8h, v6.8h, v1.h[0]\n" "fmla v18.8h, v6.8h, v2.h[0]\n" "fmla v22.8h, v6.8h, v3.h[0]\n" "fmla v26.8h, v6.8h, v4.h[0]\n" "fmla v30.8h, v6.8h, v5.h[0]\n" - "ldr q6, [x15, 
#0x40]\n" + "ldr q6, [x10, #0x40]\n" "fmla v11.8h, v7.8h, v0.h[0]\n" "fmla v15.8h, v7.8h, v1.h[0]\n" "fmla v19.8h, v7.8h, v2.h[0]\n" "fmla v23.8h, v7.8h, v3.h[0]\n" "fmla v27.8h, v7.8h, v4.h[0]\n" "fmla v31.8h, v7.8h, v5.h[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" "fmla v8.8h, v6.8h, v0.h[1]\n" "fmla v12.8h, v6.8h, v1.h[1]\n" "fmla v16.8h, v6.8h, v2.h[1]\n" "fmla v20.8h, v6.8h, v3.h[1]\n" "fmla v24.8h, v6.8h, v4.h[1]\n" "fmla v28.8h, v6.8h, v5.h[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" "fmla v9.8h, v7.8h, v0.h[1]\n" "fmla v13.8h, v7.8h, v1.h[1]\n" "fmla v17.8h, v7.8h, v2.h[1]\n" "fmla v21.8h, v7.8h, v3.h[1]\n" "fmla v25.8h, v7.8h, v4.h[1]\n" "fmla v29.8h, v7.8h, v5.h[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" "fmla v10.8h, v6.8h, v0.h[1]\n" "fmla v14.8h, v6.8h, v1.h[1]\n" "fmla v18.8h, v6.8h, v2.h[1]\n" "fmla v22.8h, v6.8h, v3.h[1]\n" "fmla v26.8h, v6.8h, v4.h[1]\n" "fmla v30.8h, v6.8h, v5.h[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" "fmla v11.8h, v7.8h, v0.h[1]\n" "fmla v15.8h, v7.8h, v1.h[1]\n" "fmla v19.8h, v7.8h, v2.h[1]\n" "fmla v23.8h, v7.8h, v3.h[1]\n" "fmla v27.8h, v7.8h, v4.h[1]\n" "fmla v31.8h, v7.8h, v5.h[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" "fmla v8.8h, v6.8h, v0.h[2]\n" "fmla v12.8h, v6.8h, v1.h[2]\n" "fmla v16.8h, v6.8h, v2.h[2]\n" "fmla v20.8h, v6.8h, v3.h[2]\n" "fmla v24.8h, v6.8h, v4.h[2]\n" "fmla v28.8h, v6.8h, v5.h[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" "fmla v9.8h, v7.8h, v0.h[2]\n" "fmla v13.8h, v7.8h, v1.h[2]\n" "fmla v17.8h, v7.8h, v2.h[2]\n" "fmla v21.8h, v7.8h, v3.h[2]\n" "fmla v25.8h, v7.8h, v4.h[2]\n" "fmla v29.8h, v7.8h, v5.h[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" "fmla v10.8h, v6.8h, v0.h[2]\n" "fmla v14.8h, v6.8h, v1.h[2]\n" "fmla v18.8h, v6.8h, v2.h[2]\n" "fmla v22.8h, v6.8h, v3.h[2]\n" "fmla v26.8h, v6.8h, v4.h[2]\n" "fmla v30.8h, v6.8h, v5.h[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, 
#0xc0]\n" "fmla v11.8h, v7.8h, v0.h[2]\n" "fmla v15.8h, v7.8h, v1.h[2]\n" "fmla v19.8h, v7.8h, v2.h[2]\n" "fmla v23.8h, v7.8h, v3.h[2]\n" "fmla v27.8h, v7.8h, v4.h[2]\n" "fmla v31.8h, v7.8h, v5.h[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" "fmla v8.8h, v6.8h, v0.h[3]\n" "fmla v12.8h, v6.8h, v1.h[3]\n" "fmla v16.8h, v6.8h, v2.h[3]\n" "fmla v20.8h, v6.8h, v3.h[3]\n" "fmla v24.8h, v6.8h, v4.h[3]\n" "fmla v28.8h, v6.8h, v5.h[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" "fmla v9.8h, v7.8h, v0.h[3]\n" "fmla v13.8h, v7.8h, v1.h[3]\n" "fmla v17.8h, v7.8h, v2.h[3]\n" "fmla v21.8h, v7.8h, v3.h[3]\n" "fmla v25.8h, v7.8h, v4.h[3]\n" "fmla v29.8h, v7.8h, v5.h[3]\n" - "ldr q7, [x15, #0xf0]\n" + "ldr q7, [x10, #0xf0]\n" "fmla v10.8h, v6.8h, v0.h[3]\n" "fmla v14.8h, v6.8h, v1.h[3]\n" "fmla v18.8h, v6.8h, v2.h[3]\n" "fmla v22.8h, v6.8h, v3.h[3]\n" "fmla v26.8h, v6.8h, v4.h[3]\n" "fmla v30.8h, v6.8h, v5.h[3]\n" - "ldr q6, [x15, #0x100]\n" + "ldr q6, [x10, #0x100]\n" "fmla v11.8h, v7.8h, v0.h[3]\n" "fmla v15.8h, v7.8h, v1.h[3]\n" "fmla v19.8h, v7.8h, v2.h[3]\n" "fmla v23.8h, v7.8h, v3.h[3]\n" "fmla v27.8h, v7.8h, v4.h[3]\n" "fmla v31.8h, v7.8h, v5.h[3]\n" - "ldr q7, [x15, #0x110]\n" + "ldr q7, [x10, #0x110]\n" "fmla v8.8h, v6.8h, v0.h[4]\n" "fmla v12.8h, v6.8h, v1.h[4]\n" "fmla v16.8h, v6.8h, v2.h[4]\n" "fmla v20.8h, v6.8h, v3.h[4]\n" "fmla v24.8h, v6.8h, v4.h[4]\n" "fmla v28.8h, v6.8h, v5.h[4]\n" - "ldr q6, [x15, #0x120]\n" + "ldr q6, [x10, #0x120]\n" "fmla v9.8h, v7.8h, v0.h[4]\n" "fmla v13.8h, v7.8h, v1.h[4]\n" "fmla v17.8h, v7.8h, v2.h[4]\n" "fmla v21.8h, v7.8h, v3.h[4]\n" "fmla v25.8h, v7.8h, v4.h[4]\n" "fmla v29.8h, v7.8h, v5.h[4]\n" - "ldr q7, [x15, #0x130]\n" + "ldr q7, [x10, #0x130]\n" "fmla v10.8h, v6.8h, v0.h[4]\n" "fmla v14.8h, v6.8h, v1.h[4]\n" "fmla v18.8h, v6.8h, v2.h[4]\n" "fmla v22.8h, v6.8h, v3.h[4]\n" "fmla v26.8h, v6.8h, v4.h[4]\n" "fmla v30.8h, v6.8h, v5.h[4]\n" - "ldr q6, [x15, #0x140]\n" + "ldr q6, [x10, #0x140]\n" "fmla v11.8h, 
v7.8h, v0.h[4]\n" "fmla v15.8h, v7.8h, v1.h[4]\n" "fmla v19.8h, v7.8h, v2.h[4]\n" "fmla v23.8h, v7.8h, v3.h[4]\n" "fmla v27.8h, v7.8h, v4.h[4]\n" "fmla v31.8h, v7.8h, v5.h[4]\n" - "ldr q7, [x15, #0x150]\n" + "ldr q7, [x10, #0x150]\n" "fmla v8.8h, v6.8h, v0.h[5]\n" "fmla v12.8h, v6.8h, v1.h[5]\n" "fmla v16.8h, v6.8h, v2.h[5]\n" "fmla v20.8h, v6.8h, v3.h[5]\n" "fmla v24.8h, v6.8h, v4.h[5]\n" "fmla v28.8h, v6.8h, v5.h[5]\n" - "ldr q6, [x15, #0x160]\n" + "ldr q6, [x10, #0x160]\n" "fmla v9.8h, v7.8h, v0.h[5]\n" "fmla v13.8h, v7.8h, v1.h[5]\n" "fmla v17.8h, v7.8h, v2.h[5]\n" "fmla v21.8h, v7.8h, v3.h[5]\n" "fmla v25.8h, v7.8h, v4.h[5]\n" "fmla v29.8h, v7.8h, v5.h[5]\n" - "ldr q7, [x15, #0x170]\n" + "ldr q7, [x10, #0x170]\n" "fmla v10.8h, v6.8h, v0.h[5]\n" "fmla v14.8h, v6.8h, v1.h[5]\n" "fmla v18.8h, v6.8h, v2.h[5]\n" "fmla v22.8h, v6.8h, v3.h[5]\n" "fmla v26.8h, v6.8h, v4.h[5]\n" "fmla v30.8h, v6.8h, v5.h[5]\n" - "ldr q6, [x15, #0x180]\n" + "ldr q6, [x10, #0x180]\n" "fmla v11.8h, v7.8h, v0.h[5]\n" "fmla v15.8h, v7.8h, v1.h[5]\n" "fmla v19.8h, v7.8h, v2.h[5]\n" "fmla v23.8h, v7.8h, v3.h[5]\n" "fmla v27.8h, v7.8h, v4.h[5]\n" "fmla v31.8h, v7.8h, v5.h[5]\n" - "ldr q7, [x15, #0x190]\n" + "ldr q7, [x10, #0x190]\n" "fmla v8.8h, v6.8h, v0.h[6]\n" "fmla v12.8h, v6.8h, v1.h[6]\n" "fmla v16.8h, v6.8h, v2.h[6]\n" "fmla v20.8h, v6.8h, v3.h[6]\n" "fmla v24.8h, v6.8h, v4.h[6]\n" "fmla v28.8h, v6.8h, v5.h[6]\n" - "ldr q6, [x15, #0x1a0]\n" + "ldr q6, [x10, #0x1a0]\n" "fmla v9.8h, v7.8h, v0.h[6]\n" "fmla v13.8h, v7.8h, v1.h[6]\n" "fmla v17.8h, v7.8h, v2.h[6]\n" "fmla v21.8h, v7.8h, v3.h[6]\n" "fmla v25.8h, v7.8h, v4.h[6]\n" "fmla v29.8h, v7.8h, v5.h[6]\n" - "ldr q7, [x15, #0x1b0]\n" + "ldr q7, [x10, #0x1b0]\n" "fmla v10.8h, v6.8h, v0.h[6]\n" "fmla v14.8h, v6.8h, v1.h[6]\n" "fmla v18.8h, v6.8h, v2.h[6]\n" "fmla v22.8h, v6.8h, v3.h[6]\n" "fmla v26.8h, v6.8h, v4.h[6]\n" "fmla v30.8h, v6.8h, v5.h[6]\n" - "ldr q6, [x15, #0x1c0]\n" + "ldr q6, [x10, #0x1c0]\n" "fmla v11.8h, v7.8h, v0.h[6]\n" 
"fmla v15.8h, v7.8h, v1.h[6]\n" "fmla v19.8h, v7.8h, v2.h[6]\n" "fmla v23.8h, v7.8h, v3.h[6]\n" "fmla v27.8h, v7.8h, v4.h[6]\n" "fmla v31.8h, v7.8h, v5.h[6]\n" - "ldr q7, [x15, #0x1d0]\n" + "ldr q7, [x10, #0x1d0]\n" "fmla v8.8h, v6.8h, v0.h[7]\n" "fmla v12.8h, v6.8h, v1.h[7]\n" "fmla v16.8h, v6.8h, v2.h[7]\n" "fmla v20.8h, v6.8h, v3.h[7]\n" "fmla v24.8h, v6.8h, v4.h[7]\n" "fmla v28.8h, v6.8h, v5.h[7]\n" - "ldr q6, [x15, #0x1e0]\n" + "ldr q6, [x10, #0x1e0]\n" "fmla v9.8h, v7.8h, v0.h[7]\n" "fmla v13.8h, v7.8h, v1.h[7]\n" "fmla v17.8h, v7.8h, v2.h[7]\n" "fmla v21.8h, v7.8h, v3.h[7]\n" "fmla v25.8h, v7.8h, v4.h[7]\n" "fmla v29.8h, v7.8h, v5.h[7]\n" - "ldr q7, [x15, #0x1f0]\n" + "ldr q7, [x10, #0x1f0]\n" + "add x10, x10, #0x200\n" "fmla v10.8h, v6.8h, v0.h[7]\n" - "add x15, x15, #0x200\n" "fmla v14.8h, v6.8h, v1.h[7]\n" "fmla v18.8h, v6.8h, v2.h[7]\n" "fmla v22.8h, v6.8h, v3.h[7]\n" @@ -4982,34 +4912,34 @@ void a64_hybrid_fp16_mla_6x32 ( "fmla v23.8h, v7.8h, v3.h[7]\n" "fmla v27.8h, v7.8h, v4.h[7]\n" "fmla v31.8h, v7.8h, v5.h[7]\n" - "279:" // Height 6: Multiply loop: Main loop skip - "cbz x11, 281f\n" - "280:" // Height 6: Multiply loop: Odd block loop - "ldr h0, [x10], #0x2\n" - "ldr h1, [x28], #0x2\n" - "ldr h2, [x26], #0x2\n" - "ldr h3, [x24], #0x2\n" - "ldr h4, [x22], #0x2\n" + "273:" // Height 6: Multiply loop: Main loop skip + "cbz x26, 275f\n" + "274:" // Height 6: Multiply loop: Odd block loop + "ldr h0, [x25], #0x2\n" + "sub x26, x26, #0x1\n" + "ldr h1, [x24], #0x2\n" + "ldr h2, [x23], #0x2\n" + "ldr h3, [x22], #0x2\n" + "ldr h4, [x21], #0x2\n" "ldr h5, [x20], #0x2\n" - "ldr q6, [x15, #0x0]\n" + "ldr q6, [x10, #0x0]\n" "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" "fmla v12.8h, v6.8h, v1.h[0]\n" - "sub x11, x11, #0x1\n" "fmla v16.8h, v6.8h, v2.h[0]\n" "fmla v20.8h, v6.8h, v3.h[0]\n" "fmla v24.8h, v6.8h, v4.h[0]\n" "fmla v28.8h, v6.8h, v5.h[0]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q6, [x10, #0x20]\n" "fmla v9.8h, v7.8h, 
v0.h[0]\n" "fmla v13.8h, v7.8h, v1.h[0]\n" "fmla v17.8h, v7.8h, v2.h[0]\n" "fmla v21.8h, v7.8h, v3.h[0]\n" "fmla v25.8h, v7.8h, v4.h[0]\n" "fmla v29.8h, v7.8h, v5.h[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" "fmla v10.8h, v6.8h, v0.h[0]\n" - "add x15, x15, #0x40\n" "fmla v14.8h, v6.8h, v1.h[0]\n" "fmla v18.8h, v6.8h, v2.h[0]\n" "fmla v22.8h, v6.8h, v3.h[0]\n" @@ -5021,19 +4951,25 @@ void a64_hybrid_fp16_mla_6x32 ( "fmla v23.8h, v7.8h, v3.h[0]\n" "fmla v27.8h, v7.8h, v4.h[0]\n" "fmla v31.8h, v7.8h, v5.h[0]\n" - "cbnz x11, 280b\n" - "281:" // Height 6: Multiply loop: No odd multiplies + "cbnz x26, 274b\n" + "275:" // Height 6: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 274b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "prfm pstl1keep, [x27, #0x0]\n" - "prfm pstl1keep, [x25, #0x0]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 268b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x24, x28, x19, LSL #1\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #1\n" "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #1\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #1\n" "prfm pstl1keep, [x21, #0x0]\n" - "tbz %x[flags], #1, 282f\n" + "add x20, x21, x19, LSL #1\n" + "prfm pstl1keep, [x20, #0x0]\n" + "tbz %x[flags], #1, 276f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1r { v1.8h }, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -5086,313 +5022,308 @@ void a64_hybrid_fp16_mla_6x32 ( "fmin v31.8h, v31.8h, v0.8h\n" "fmax v30.8h, v30.8h, v1.8h\n" "fmax v31.8h, v31.8h, v1.8h\n" - "282:" // Height 6: No activation - "cmp x16, #0x20\n" - "bge 299f\n" - "tbz x16, #4, 290f\n" - "st1 { v8.8h }, [x13], #0x10\n" - "st1 { v9.8h }, [x13], #0x10\n" - "st1 { v12.8h }, [x9], #0x10\n" - "st1 { v13.8h }, [x9], #0x10\n" - "st1 { v16.8h 
}, [x27], #0x10\n" - "st1 { v17.8h }, [x27], #0x10\n" - "st1 { v20.8h }, [x25], #0x10\n" - "st1 { v21.8h }, [x25], #0x10\n" - "st1 { v24.8h }, [x23], #0x10\n" - "st1 { v25.8h }, [x23], #0x10\n" - "st1 { v28.8h }, [x21], #0x10\n" - "st1 { v29.8h }, [x21], #0x10\n" - "tbz x16, #3, 286f\n" - "st1 { v10.8h }, [x13], #0x10\n" - "st1 { v14.8h }, [x9], #0x10\n" - "st1 { v18.8h }, [x27], #0x10\n" - "st1 { v22.8h }, [x25], #0x10\n" - "st1 { v26.8h }, [x23], #0x10\n" - "st1 { v30.8h }, [x21], #0x10\n" - "tbz x16, #2, 284f\n" - "str d11, [x13], #0x8\n" - "str d15, [x9], #0x8\n" - "str d19, [x27], #0x8\n" - "str d23, [x25], #0x8\n" - "str d27, [x23], #0x8\n" - "str d31, [x21], #0x8\n" - "tbz x16, #1, 283f\n" - "st1 { v11.s }[2], [x13], #0x4\n" - "st1 { v15.s }[2], [x9], #0x4\n" - "st1 { v19.s }[2], [x27], #0x4\n" - "st1 { v23.s }[2], [x25], #0x4\n" - "st1 { v27.s }[2], [x23], #0x4\n" - "st1 { v31.s }[2], [x21], #0x4\n" - "tbz x16, #0, 298f\n" - "st1 { v11.h }[6], [x13]\n" - "st1 { v15.h }[6], [x9]\n" - "st1 { v19.h }[6], [x27]\n" - "st1 { v23.h }[6], [x25]\n" - "st1 { v27.h }[6], [x23]\n" - "st1 { v31.h }[6], [x21]\n" - "b 298f\n" - "283:" // Height 6: Partial direct writeback: partial_1_28 - "tbz x16, #0, 298f\n" - "st1 { v11.h }[4], [x13]\n" - "st1 { v15.h }[4], [x9]\n" - "st1 { v19.h }[4], [x27]\n" - "st1 { v23.h }[4], [x25]\n" - "st1 { v27.h }[4], [x23]\n" - "st1 { v31.h }[4], [x21]\n" - "b 298f\n" - "284:" // Height 6: Partial direct writeback: partial_2_24 - "tbz x16, #1, 285f\n" - "str s11, [x13], #0x4\n" - "str s15, [x9], #0x4\n" - "str s19, [x27], #0x4\n" - "str s23, [x25], #0x4\n" - "str s27, [x23], #0x4\n" - "str s31, [x21], #0x4\n" - "tbz x16, #0, 298f\n" - "st1 { v11.h }[2], [x13]\n" - "st1 { v15.h }[2], [x9]\n" - "st1 { v19.h }[2], [x27]\n" - "st1 { v23.h }[2], [x25]\n" - "st1 { v27.h }[2], [x23]\n" - "st1 { v31.h }[2], [x21]\n" - "b 298f\n" - "285:" // Height 6: Partial direct writeback: partial_1_24 - "tbz x16, #0, 298f\n" - "str h11, [x13, #0x0]\n" - "str h15, 
[x9, #0x0]\n" - "str h19, [x27, #0x0]\n" - "str h23, [x25, #0x0]\n" - "str h27, [x23, #0x0]\n" - "str h31, [x21, #0x0]\n" - "b 298f\n" - "286:" // Height 6: Partial direct writeback: partial_4_16 - "tbz x16, #2, 288f\n" - "str d10, [x13], #0x8\n" - "str d14, [x9], #0x8\n" - "str d18, [x27], #0x8\n" - "str d22, [x25], #0x8\n" - "str d26, [x23], #0x8\n" - "str d30, [x21], #0x8\n" - "tbz x16, #1, 287f\n" - "st1 { v10.s }[2], [x13], #0x4\n" - "st1 { v14.s }[2], [x9], #0x4\n" - "st1 { v18.s }[2], [x27], #0x4\n" - "st1 { v22.s }[2], [x25], #0x4\n" - "st1 { v26.s }[2], [x23], #0x4\n" - "st1 { v30.s }[2], [x21], #0x4\n" - "tbz x16, #0, 298f\n" - "st1 { v10.h }[6], [x13]\n" - "st1 { v14.h }[6], [x9]\n" - "st1 { v18.h }[6], [x27]\n" - "st1 { v22.h }[6], [x25]\n" - "st1 { v26.h }[6], [x23]\n" - "st1 { v30.h }[6], [x21]\n" - "b 298f\n" - "287:" // Height 6: Partial direct writeback: partial_1_20 - "tbz x16, #0, 298f\n" - "st1 { v10.h }[4], [x13]\n" - "st1 { v14.h }[4], [x9]\n" - "st1 { v18.h }[4], [x27]\n" - "st1 { v22.h }[4], [x25]\n" - "st1 { v26.h }[4], [x23]\n" - "st1 { v30.h }[4], [x21]\n" - "b 298f\n" - "288:" // Height 6: Partial direct writeback: partial_2_16 - "tbz x16, #1, 289f\n" - "str s10, [x13], #0x4\n" - "str s14, [x9], #0x4\n" - "str s18, [x27], #0x4\n" - "str s22, [x25], #0x4\n" - "str s26, [x23], #0x4\n" - "str s30, [x21], #0x4\n" - "tbz x16, #0, 298f\n" - "st1 { v10.h }[2], [x13]\n" - "st1 { v14.h }[2], [x9]\n" - "st1 { v18.h }[2], [x27]\n" - "st1 { v22.h }[2], [x25]\n" - "st1 { v26.h }[2], [x23]\n" - "st1 { v30.h }[2], [x21]\n" - "b 298f\n" - "289:" // Height 6: Partial direct writeback: partial_1_16 - "tbz x16, #0, 298f\n" - "str h10, [x13, #0x0]\n" - "str h14, [x9, #0x0]\n" - "str h18, [x27, #0x0]\n" - "str h22, [x25, #0x0]\n" - "str h26, [x23, #0x0]\n" - "str h30, [x21, #0x0]\n" - "b 298f\n" - "290:" // Height 6: Partial direct writeback: partial_8_0 - "tbz x16, #3, 294f\n" - "st1 { v8.8h }, [x13], #0x10\n" - "st1 { v12.8h }, [x9], #0x10\n" - "st1 { 
v16.8h }, [x27], #0x10\n" - "st1 { v20.8h }, [x25], #0x10\n" - "st1 { v24.8h }, [x23], #0x10\n" - "st1 { v28.8h }, [x21], #0x10\n" - "tbz x16, #2, 292f\n" - "str d9, [x13], #0x8\n" - "str d13, [x9], #0x8\n" - "str d17, [x27], #0x8\n" - "str d21, [x25], #0x8\n" - "str d25, [x23], #0x8\n" - "str d29, [x21], #0x8\n" - "tbz x16, #1, 291f\n" - "st1 { v9.s }[2], [x13], #0x4\n" - "st1 { v13.s }[2], [x9], #0x4\n" - "st1 { v17.s }[2], [x27], #0x4\n" - "st1 { v21.s }[2], [x25], #0x4\n" - "st1 { v25.s }[2], [x23], #0x4\n" - "st1 { v29.s }[2], [x21], #0x4\n" - "tbz x16, #0, 298f\n" - "st1 { v9.h }[6], [x13]\n" - "st1 { v13.h }[6], [x9]\n" - "st1 { v17.h }[6], [x27]\n" - "st1 { v21.h }[6], [x25]\n" - "st1 { v25.h }[6], [x23]\n" - "st1 { v29.h }[6], [x21]\n" - "b 298f\n" - "291:" // Height 6: Partial direct writeback: partial_1_12 - "tbz x16, #0, 298f\n" - "st1 { v9.h }[4], [x13]\n" - "st1 { v13.h }[4], [x9]\n" - "st1 { v17.h }[4], [x27]\n" - "st1 { v21.h }[4], [x25]\n" - "st1 { v25.h }[4], [x23]\n" - "st1 { v29.h }[4], [x21]\n" - "b 298f\n" - "292:" // Height 6: Partial direct writeback: partial_2_8 - "tbz x16, #1, 293f\n" - "str s9, [x13], #0x4\n" - "str s13, [x9], #0x4\n" - "str s17, [x27], #0x4\n" - "str s21, [x25], #0x4\n" - "str s25, [x23], #0x4\n" - "str s29, [x21], #0x4\n" - "tbz x16, #0, 298f\n" - "st1 { v9.h }[2], [x13]\n" - "st1 { v13.h }[2], [x9]\n" - "st1 { v17.h }[2], [x27]\n" - "st1 { v21.h }[2], [x25]\n" - "st1 { v25.h }[2], [x23]\n" - "st1 { v29.h }[2], [x21]\n" - "b 298f\n" - "293:" // Height 6: Partial direct writeback: partial_1_8 - "tbz x16, #0, 298f\n" - "str h9, [x13, #0x0]\n" - "str h13, [x9, #0x0]\n" - "str h17, [x27, #0x0]\n" - "str h21, [x25, #0x0]\n" - "str h25, [x23, #0x0]\n" - "str h29, [x21, #0x0]\n" - "b 298f\n" - "294:" // Height 6: Partial direct writeback: partial_4_0 - "tbz x16, #2, 296f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "str d16, [x27], #0x8\n" - "str d20, [x25], #0x8\n" - "str d24, [x23], #0x8\n" - "str d28, [x21], 
#0x8\n" - "tbz x16, #1, 295f\n" - "st1 { v8.s }[2], [x13], #0x4\n" - "st1 { v12.s }[2], [x9], #0x4\n" - "st1 { v16.s }[2], [x27], #0x4\n" - "st1 { v20.s }[2], [x25], #0x4\n" - "st1 { v24.s }[2], [x23], #0x4\n" - "st1 { v28.s }[2], [x21], #0x4\n" - "tbz x16, #0, 298f\n" - "st1 { v8.h }[6], [x13]\n" - "st1 { v12.h }[6], [x9]\n" - "st1 { v16.h }[6], [x27]\n" - "st1 { v20.h }[6], [x25]\n" - "st1 { v24.h }[6], [x23]\n" - "st1 { v28.h }[6], [x21]\n" - "b 298f\n" - "295:" // Height 6: Partial direct writeback: partial_1_4 - "tbz x16, #0, 298f\n" - "st1 { v8.h }[4], [x13]\n" - "st1 { v12.h }[4], [x9]\n" - "st1 { v16.h }[4], [x27]\n" - "st1 { v20.h }[4], [x25]\n" - "st1 { v24.h }[4], [x23]\n" - "st1 { v28.h }[4], [x21]\n" - "b 298f\n" - "296:" // Height 6: Partial direct writeback: partial_2_0 - "tbz x16, #1, 297f\n" - "str s8, [x13], #0x4\n" - "str s12, [x9], #0x4\n" - "str s16, [x27], #0x4\n" - "str s20, [x25], #0x4\n" - "str s24, [x23], #0x4\n" - "str s28, [x21], #0x4\n" - "tbz x16, #0, 298f\n" - "st1 { v8.h }[2], [x13]\n" - "st1 { v12.h }[2], [x9]\n" - "st1 { v16.h }[2], [x27]\n" - "st1 { v20.h }[2], [x25]\n" - "st1 { v24.h }[2], [x23]\n" - "st1 { v28.h }[2], [x21]\n" - "b 298f\n" - "297:" // Height 6: Partial direct writeback: partial_1_0 - "str h8, [x13, #0x0]\n" - "str h12, [x9, #0x0]\n" - "str h16, [x27, #0x0]\n" - "str h20, [x25, #0x0]\n" - "str h24, [x23, #0x0]\n" - "str h28, [x21, #0x0]\n" - "298:" // Height 6: Partial direct writeback: Done - "b 300f\n" - "299:" // Height 6: Full writeback - "str q8, [x13, #0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "str q12, [x9, #0x0]\n" - "str q13, [x9, #0x10]\n" - "str q14, [x9, #0x20]\n" - "str q15, [x9, #0x30]\n" - "str q16, [x27, #0x0]\n" - "str q17, [x27, #0x10]\n" - "str q18, [x27, #0x20]\n" - "str q19, [x27, #0x30]\n" - "str q20, [x25, #0x0]\n" - "str q21, [x25, #0x10]\n" - "str q22, [x25, #0x20]\n" - "str q23, [x25, #0x30]\n" - "str q24, [x23, #0x0]\n" - "str q25, 
[x23, #0x10]\n" - "str q26, [x23, #0x20]\n" - "str q27, [x23, #0x30]\n" - "str q28, [x21, #0x0]\n" - "str q29, [x21, #0x10]\n" - "str q30, [x21, #0x20]\n" - "str q31, [x21, #0x30]\n" - "add x13, x13, #0x40\n" - "add x9, x9, #0x40\n" - "add x27, x27, #0x40\n" - "add x25, x25, #0x40\n" - "add x23, x23, #0x40\n" - "add x21, x21, #0x40\n" - "300:" // Height 6: Writeback done - "subs x16, x16, #0x20\n" - "bgt 253b\n" + "276:" // Height 6: No activation + "cmp x11, #0x20\n" + "bge 293f\n" + "tbz x11, #4, 284f\n" + "st1 { v8.8h }, [x28], #0x10\n" + "st1 { v9.8h }, [x28], #0x10\n" + "st1 { v12.8h }, [x24], #0x10\n" + "st1 { v13.8h }, [x24], #0x10\n" + "st1 { v16.8h }, [x23], #0x10\n" + "st1 { v17.8h }, [x23], #0x10\n" + "st1 { v20.8h }, [x22], #0x10\n" + "st1 { v21.8h }, [x22], #0x10\n" + "st1 { v24.8h }, [x21], #0x10\n" + "st1 { v25.8h }, [x21], #0x10\n" + "st1 { v28.8h }, [x20], #0x10\n" + "st1 { v29.8h }, [x20], #0x10\n" + "tbz x11, #3, 280f\n" + "st1 { v10.8h }, [x28], #0x10\n" + "st1 { v14.8h }, [x24], #0x10\n" + "st1 { v18.8h }, [x23], #0x10\n" + "st1 { v22.8h }, [x22], #0x10\n" + "st1 { v26.8h }, [x21], #0x10\n" + "st1 { v30.8h }, [x20], #0x10\n" + "tbz x11, #2, 278f\n" + "str d11, [x28], #0x8\n" + "str d15, [x24], #0x8\n" + "str d19, [x23], #0x8\n" + "str d23, [x22], #0x8\n" + "str d27, [x21], #0x8\n" + "str d31, [x20], #0x8\n" + "tbz x11, #1, 277f\n" + "st1 { v11.s }[2], [x28], #0x4\n" + "st1 { v15.s }[2], [x24], #0x4\n" + "st1 { v19.s }[2], [x23], #0x4\n" + "st1 { v23.s }[2], [x22], #0x4\n" + "st1 { v27.s }[2], [x21], #0x4\n" + "st1 { v31.s }[2], [x20], #0x4\n" + "tbz x11, #0, 292f\n" + "st1 { v11.h }[6], [x28]\n" + "st1 { v15.h }[6], [x24]\n" + "st1 { v19.h }[6], [x23]\n" + "st1 { v23.h }[6], [x22]\n" + "st1 { v27.h }[6], [x21]\n" + "st1 { v31.h }[6], [x20]\n" + "b 292f\n" + "277:" // Height 6: Partial direct writeback: partial_1_28 + "tbz x11, #0, 292f\n" + "st1 { v11.h }[4], [x28]\n" + "st1 { v15.h }[4], [x24]\n" + "st1 { v19.h }[4], [x23]\n" + "st1 { v23.h 
}[4], [x22]\n" + "st1 { v27.h }[4], [x21]\n" + "st1 { v31.h }[4], [x20]\n" + "b 292f\n" + "278:" // Height 6: Partial direct writeback: partial_2_24 + "tbz x11, #1, 279f\n" + "str s11, [x28], #0x4\n" + "str s15, [x24], #0x4\n" + "str s19, [x23], #0x4\n" + "str s23, [x22], #0x4\n" + "str s27, [x21], #0x4\n" + "str s31, [x20], #0x4\n" + "tbz x11, #0, 292f\n" + "st1 { v11.h }[2], [x28]\n" + "st1 { v15.h }[2], [x24]\n" + "st1 { v19.h }[2], [x23]\n" + "st1 { v23.h }[2], [x22]\n" + "st1 { v27.h }[2], [x21]\n" + "st1 { v31.h }[2], [x20]\n" + "b 292f\n" + "279:" // Height 6: Partial direct writeback: partial_1_24 + "tbz x11, #0, 292f\n" + "str h11, [x28, #0x0]\n" + "str h15, [x24, #0x0]\n" + "str h19, [x23, #0x0]\n" + "str h23, [x22, #0x0]\n" + "str h27, [x21, #0x0]\n" + "str h31, [x20, #0x0]\n" + "b 292f\n" + "280:" // Height 6: Partial direct writeback: partial_4_16 + "tbz x11, #2, 282f\n" + "str d10, [x28], #0x8\n" + "str d14, [x24], #0x8\n" + "str d18, [x23], #0x8\n" + "str d22, [x22], #0x8\n" + "str d26, [x21], #0x8\n" + "str d30, [x20], #0x8\n" + "tbz x11, #1, 281f\n" + "st1 { v10.s }[2], [x28], #0x4\n" + "st1 { v14.s }[2], [x24], #0x4\n" + "st1 { v18.s }[2], [x23], #0x4\n" + "st1 { v22.s }[2], [x22], #0x4\n" + "st1 { v26.s }[2], [x21], #0x4\n" + "st1 { v30.s }[2], [x20], #0x4\n" + "tbz x11, #0, 292f\n" + "st1 { v10.h }[6], [x28]\n" + "st1 { v14.h }[6], [x24]\n" + "st1 { v18.h }[6], [x23]\n" + "st1 { v22.h }[6], [x22]\n" + "st1 { v26.h }[6], [x21]\n" + "st1 { v30.h }[6], [x20]\n" + "b 292f\n" + "281:" // Height 6: Partial direct writeback: partial_1_20 + "tbz x11, #0, 292f\n" + "st1 { v10.h }[4], [x28]\n" + "st1 { v14.h }[4], [x24]\n" + "st1 { v18.h }[4], [x23]\n" + "st1 { v22.h }[4], [x22]\n" + "st1 { v26.h }[4], [x21]\n" + "st1 { v30.h }[4], [x20]\n" + "b 292f\n" + "282:" // Height 6: Partial direct writeback: partial_2_16 + "tbz x11, #1, 283f\n" + "str s10, [x28], #0x4\n" + "str s14, [x24], #0x4\n" + "str s18, [x23], #0x4\n" + "str s22, [x22], #0x4\n" + "str s26, 
[x21], #0x4\n" + "str s30, [x20], #0x4\n" + "tbz x11, #0, 292f\n" + "st1 { v10.h }[2], [x28]\n" + "st1 { v14.h }[2], [x24]\n" + "st1 { v18.h }[2], [x23]\n" + "st1 { v22.h }[2], [x22]\n" + "st1 { v26.h }[2], [x21]\n" + "st1 { v30.h }[2], [x20]\n" + "b 292f\n" + "283:" // Height 6: Partial direct writeback: partial_1_16 + "tbz x11, #0, 292f\n" + "str h10, [x28, #0x0]\n" + "str h14, [x24, #0x0]\n" + "str h18, [x23, #0x0]\n" + "str h22, [x22, #0x0]\n" + "str h26, [x21, #0x0]\n" + "str h30, [x20, #0x0]\n" + "b 292f\n" + "284:" // Height 6: Partial direct writeback: partial_8_0 + "tbz x11, #3, 288f\n" + "st1 { v8.8h }, [x28], #0x10\n" + "st1 { v12.8h }, [x24], #0x10\n" + "st1 { v16.8h }, [x23], #0x10\n" + "st1 { v20.8h }, [x22], #0x10\n" + "st1 { v24.8h }, [x21], #0x10\n" + "st1 { v28.8h }, [x20], #0x10\n" + "tbz x11, #2, 286f\n" + "str d9, [x28], #0x8\n" + "str d13, [x24], #0x8\n" + "str d17, [x23], #0x8\n" + "str d21, [x22], #0x8\n" + "str d25, [x21], #0x8\n" + "str d29, [x20], #0x8\n" + "tbz x11, #1, 285f\n" + "st1 { v9.s }[2], [x28], #0x4\n" + "st1 { v13.s }[2], [x24], #0x4\n" + "st1 { v17.s }[2], [x23], #0x4\n" + "st1 { v21.s }[2], [x22], #0x4\n" + "st1 { v25.s }[2], [x21], #0x4\n" + "st1 { v29.s }[2], [x20], #0x4\n" + "tbz x11, #0, 292f\n" + "st1 { v9.h }[6], [x28]\n" + "st1 { v13.h }[6], [x24]\n" + "st1 { v17.h }[6], [x23]\n" + "st1 { v21.h }[6], [x22]\n" + "st1 { v25.h }[6], [x21]\n" + "st1 { v29.h }[6], [x20]\n" + "b 292f\n" + "285:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x11, #0, 292f\n" + "st1 { v9.h }[4], [x28]\n" + "st1 { v13.h }[4], [x24]\n" + "st1 { v17.h }[4], [x23]\n" + "st1 { v21.h }[4], [x22]\n" + "st1 { v25.h }[4], [x21]\n" + "st1 { v29.h }[4], [x20]\n" + "b 292f\n" + "286:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x11, #1, 287f\n" + "str s9, [x28], #0x4\n" + "str s13, [x24], #0x4\n" + "str s17, [x23], #0x4\n" + "str s21, [x22], #0x4\n" + "str s25, [x21], #0x4\n" + "str s29, [x20], #0x4\n" + "tbz x11, #0, 292f\n" + 
"st1 { v9.h }[2], [x28]\n" + "st1 { v13.h }[2], [x24]\n" + "st1 { v17.h }[2], [x23]\n" + "st1 { v21.h }[2], [x22]\n" + "st1 { v25.h }[2], [x21]\n" + "st1 { v29.h }[2], [x20]\n" + "b 292f\n" + "287:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x11, #0, 292f\n" + "str h9, [x28, #0x0]\n" + "str h13, [x24, #0x0]\n" + "str h17, [x23, #0x0]\n" + "str h21, [x22, #0x0]\n" + "str h25, [x21, #0x0]\n" + "str h29, [x20, #0x0]\n" + "b 292f\n" + "288:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x11, #2, 290f\n" + "str d8, [x28], #0x8\n" + "str d12, [x24], #0x8\n" + "str d16, [x23], #0x8\n" + "str d20, [x22], #0x8\n" + "str d24, [x21], #0x8\n" + "str d28, [x20], #0x8\n" + "tbz x11, #1, 289f\n" + "st1 { v8.s }[2], [x28], #0x4\n" + "st1 { v12.s }[2], [x24], #0x4\n" + "st1 { v16.s }[2], [x23], #0x4\n" + "st1 { v20.s }[2], [x22], #0x4\n" + "st1 { v24.s }[2], [x21], #0x4\n" + "st1 { v28.s }[2], [x20], #0x4\n" + "tbz x11, #0, 292f\n" + "st1 { v8.h }[6], [x28]\n" + "st1 { v12.h }[6], [x24]\n" + "st1 { v16.h }[6], [x23]\n" + "st1 { v20.h }[6], [x22]\n" + "st1 { v24.h }[6], [x21]\n" + "st1 { v28.h }[6], [x20]\n" + "b 292f\n" + "289:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x11, #0, 292f\n" + "st1 { v8.h }[4], [x28]\n" + "st1 { v12.h }[4], [x24]\n" + "st1 { v16.h }[4], [x23]\n" + "st1 { v20.h }[4], [x22]\n" + "st1 { v24.h }[4], [x21]\n" + "st1 { v28.h }[4], [x20]\n" + "b 292f\n" + "290:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x11, #1, 291f\n" + "str s8, [x28], #0x4\n" + "str s12, [x24], #0x4\n" + "str s16, [x23], #0x4\n" + "str s20, [x22], #0x4\n" + "str s24, [x21], #0x4\n" + "str s28, [x20], #0x4\n" + "tbz x11, #0, 292f\n" + "st1 { v8.h }[2], [x28]\n" + "st1 { v12.h }[2], [x24]\n" + "st1 { v16.h }[2], [x23]\n" + "st1 { v20.h }[2], [x22]\n" + "st1 { v24.h }[2], [x21]\n" + "st1 { v28.h }[2], [x20]\n" + "b 292f\n" + "291:" // Height 6: Partial direct writeback: partial_1_0 + "str h8, [x28, #0x0]\n" + "str h12, [x24, #0x0]\n" + 
"str h16, [x23, #0x0]\n" + "str h20, [x22, #0x0]\n" + "str h24, [x21, #0x0]\n" + "str h28, [x20, #0x0]\n" + "292:" // Height 6: Partial direct writeback: Done + "b 294f\n" + "293:" // Height 6: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q12, [x24, #0x0]\n" + "str q13, [x24, #0x10]\n" + "str q14, [x24, #0x20]\n" + "str q15, [x24, #0x30]\n" + "str q16, [x23, #0x0]\n" + "str q17, [x23, #0x10]\n" + "str q18, [x23, #0x20]\n" + "str q19, [x23, #0x30]\n" + "str q20, [x22, #0x0]\n" + "str q21, [x22, #0x10]\n" + "str q22, [x22, #0x20]\n" + "str q23, [x22, #0x30]\n" + "str q24, [x21, #0x0]\n" + "str q25, [x21, #0x10]\n" + "str q26, [x21, #0x20]\n" + "str q27, [x21, #0x30]\n" + "str q28, [x20, #0x0]\n" + "str q29, [x20, #0x10]\n" + "str q30, [x20, #0x20]\n" + "str q31, [x20, #0x30]\n" + "294:" // Height 6: Writeback done + "subs x11, x11, #0x20\n" + "bgt 247b\n" "subs %x[M], %x[M], #0x6\n" - "beq 302f\n" + "beq 296f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 301f\n" + "tbz %x[flags], #3, 295f\n" "add x20, x20, #0x6\n" "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "301:" // Update direct input + "295:" // Update direct input "mov x19, #0xc\n" "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" "b 1b\n" - "302:" // Exit + "296:" // Exit : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" 
(offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp index b4c1ba988f..7f83e617c5 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp @@ -37,9 +37,9 @@ namespace arm_gemm { - // Actual kernel implementations void a64_hybrid_fp32_mla_6x16( ARGLIST ); +void a64_hybrid_fp32_mla_6x16_a55( ARGLIST ); class cls_a64_hybrid_fp32_mla_6x16 { @@ -72,10 +72,11 @@ public: StdTransformsFixed transforms = {}; - static PerformanceParameters get_performance_parameters(const CPUInfo *ci) { + static PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { switch (ci->get_cpu_model()) { case CPUModel::A55r1: - return { 2.287 }; + return { 3.04 }; case CPUModel::A53: return { 1.43 }; case CPUModel::A73: @@ -87,9 +88,16 @@ public: // Default to the generic kernel kern_type kernel=a64_hybrid_fp32_mla_6x16; - - cls_a64_hybrid_fp32_mla_6x16(const CPUInfo *) + cls_a64_hybrid_fp32_mla_6x16(const CPUInfo *ci) { + switch(ci->get_cpu_model()) { + default: + break; + case CPUModel::A55r1: + case 
CPUModel::A53: + kernel=a64_hybrid_fp32_mla_6x16_a55; + break; + } } }; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp new file mode 100644 index 0000000000..184cfaf95c --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp @@ -0,0 +1,3595 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include +#include + +namespace arm_gemm { + +void a64_hybrid_fp32_mla_6x16_a55 ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg A_arg, + size_t M, size_t N, const float *B_ptr, IndirectOutputArg output_arg, + const float *bias, Activation act, bool accumulate +) +{ + struct KernelArgs { + float maxval = static_cast(std::numeric_limits::infinity()); + float minval = - static_cast(std::numeric_limits::infinity()); + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const float *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( + + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 166f\n" + "cmp %x[M], #0x4\n" + "bgt 133f\n" + "beq 100f\n" + "cmp %x[M], #0x2\n" + "bgt 67f\n" + "beq 34f\n" + "ldr x8, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x17, 
[%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x16, %x[bias]\n" + "mov x15, %x[output_ptr]\n" + "2:" // Height 1: Column loop + "cbz x16, 3f\n" + "ldr q8, [x16, #0x0]\n" + "ldr q9, [x16, #0x10]\n" + "ldr q10, [x16, #0x20]\n" + "ldr q11, [x16, #0x30]\n" + "add x16, x16, #0x40\n" + "b 14f\n" + "3:" // Height 1: no bias + "tbz %x[flags], #0, 13f\n" + "cmp x8, #0x10\n" + "bge 12f\n" + "tbz x8, #3, 7f\n" + "ld1 { v8.4s }, [x15], #0x10\n" + "ld1 { v9.4s }, [x15], #0x10\n" + "tbz x8, #2, 5f\n" + "ld1 { v10.4s }, [x15], #0x10\n" + "tbz x8, #1, 4f\n" + "mov x19, #0x38\n" + "ldr d11, [x15], #0x8\n" + "tbz x8, #0, 11f\n" + "ld1 { v11.s }[2], [x15]\n" + "b 11f\n" + "4:" // Height 1: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x8, #0, 11f\n" + "ldr s11, [x15, #0x0]\n" + "b 11f\n" + "5:" // Height 1: Partial accumulate: partial_2_8 + "tbz x8, #1, 6f\n" + "ldr d10, [x15], #0x8\n" + "mov x19, #0x28\n" + "tbz x8, #0, 11f\n" + "ld1 { v10.s }[2], [x15]\n" + "b 11f\n" + "6:" // Height 1: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x8, #0, 11f\n" + "ldr s10, [x15, #0x0]\n" + "b 11f\n" + "7:" // Height 1: Partial accumulate: partial_4_0 + "tbz x8, #2, 9f\n" + "ld1 { v8.4s }, [x15], #0x10\n" + "tbz x8, #1, 8f\n" + "mov x19, #0x18\n" + "ldr d9, [x15], #0x8\n" + "tbz x8, #0, 11f\n" + "ld1 { v9.s }[2], [x15]\n" + "b 11f\n" + "8:" // Height 1: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x8, #0, 11f\n" + "ldr s9, [x15, #0x0]\n" + "b 11f\n" + "9:" // Height 1: Partial accumulate: partial_2_0 + "tbz x8, #1, 10f\n" + "ldr d8, [x15], #0x8\n" + "mov x19, #0x8\n" + "tbz x8, #0, 11f\n" + "ld1 { v8.s }[2], [x15]\n" + "b 11f\n" + "10:" // Height 1: Partial accumulate: partial_1_0 + "ldr s8, [x15, #0x0]\n" + "mov x19, #0x0\n" + "11:" // Height 1: Partial accumulate: Done + "sub x15, x15, x19\n" + "b 14f\n" + "12:" // Height 1: full accumulate + "ldr q8, [x15, #0x0]\n" + "ldr q9, [x15, #0x10]\n" + "ldr q10, [x15, #0x20]\n" + "ldr q11, [x15, #0x30]\n" + "b 
14f\n" + "13:" // Height 1: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "14:" // Height 1: setup done + "mov x14, #0x0\n" + "15:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w13, [x20, x14, LSL #0x2]\n" + "tbz %x[flags], #3, 16f\n" + "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x12, [x20, #0x0]\n" + "cbnz x14, 17f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x12, x12, x19, LSL #2\n" + "b 17f\n" + "16:" // Height 1: setup direct input + "mov x12, %x[input_ptr]\n" + "17:" // Height 1: input setup done + "cmp x13, #0x4\n" + "blt 20f\n" + "ldr q0, [x12, #0x0]\n" + "ldr q6, [x17, #0x0]\n" + "cmp x13, #0x8\n" + "blt 19f\n" + "18:" // Height 1: Multiply loop: Main loop head + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr d7, [x17, #0x10]\n" + "ldr x11, [x17, #0x18]\n" + "add x12, x12, #0x10\n" + "ldr d6, [x17, #0x20]\n" + "sub x13, x13, #0x4\n" + "ldr x10, [x17, #0x28]\n" + "cmp x13, #0x8\n" + "mov v7.d[1], x11\n" + "prfm pldl1keep, [x12, #0x80]\n" + "ldr x11, [x17, #0x38]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "ldr d6, [x17, #0x40]\n" + "ldr x10, [x17, #0x48]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x58]\n" + "ldr x9, [x12, #0x8]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "ldr d6, [x17, #0x60]\n" + "ldr x10, [x17, #0x68]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x78]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "ldr d6, [x17, #0x80]\n" + "ldr x10, [x17, #0x88]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x98]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x90]\n" + "fmla 
v8.4s, v6.4s, v0.s[2]\n" + "ldr d6, [x17, #0xa0]\n" + "ldr x10, [x17, #0xa8]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0xb8]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "ldr d6, [x17, #0xc0]\n" + "ldr x10, [x17, #0xc8]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0xd8]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "ldr d6, [x17, #0xe0]\n" + "ldr x10, [x17, #0xe8]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0xf8]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0xf0]\n" + "add x17, x17, #0x100\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "ldr d6, [x17, #0x0]\n" + "ldr x10, [x17, #0x8]\n" + "mov v7.d[1], x11\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "mov v6.d[1], x10\n" + "ldr d0, [x12, #0x0]\n" + "mov v0.d[1], x9\n" + "bge 18b\n" + "19:" // Height 1: Multiply loop: Single iteration only + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x17, #0x10]\n" + "ldr q6, [x17, #0x20]\n" + "sub x13, x13, #0x4\n" + "add x12, x12, #0x10\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x17, #0x30]\n" + "ldr q6, [x17, #0x40]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "ldr q7, [x17, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "ldr q6, [x17, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "ldr q7, [x17, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "ldr q6, [x17, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "ldr q7, [x17, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "ldr q6, [x17, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "ldr q7, [x17, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "ldr q6, [x17, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "ldr q7, [x17, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "ldr q6, [x17, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "ldr q7, [x17, #0xf0]\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "add x17, x17, #0x100\n" + 
"fmla v11.4s, v7.4s, v0.s[3]\n" + "20:" // Height 1: Multiply loop: Main loop skip + "cbz x13, 22f\n" + "21:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x12], #0x4\n" + "sub x13, x13, #0x1\n" + "ldr q6, [x17, #0x0]\n" + "ldr q7, [x17, #0x10]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q6, [x17, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "ldr q7, [x17, #0x30]\n" + "add x17, x17, #0x40\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "cbnz x13, 21b\n" + "22:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x14, x14, #0x1\n" + "cmp x14, x19\n" + "bne 15b\n" + "prfm pstl1keep, [x15, #0x0]\n" + "tbz %x[flags], #1, 23f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "23:" // Height 1: No activation + "cmp x8, #0x10\n" + "bge 32f\n" + "tbz x8, #3, 27f\n" + "st1 { v8.4s }, [x15], #0x10\n" + "st1 { v9.4s }, [x15], #0x10\n" + "tbz x8, #2, 25f\n" + "st1 { v10.4s }, [x15], #0x10\n" + "tbz x8, #1, 24f\n" + "str d11, [x15], #0x8\n" + "tbz x8, #0, 31f\n" + "st1 { v11.s }[2], [x15]\n" + "b 31f\n" + "24:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x8, #0, 31f\n" + "str s11, [x15, #0x0]\n" + "b 31f\n" + "25:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x8, #1, 26f\n" + "str d10, [x15], #0x8\n" + "tbz x8, #0, 31f\n" + "st1 { v10.s }[2], [x15]\n" + "b 31f\n" + "26:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x8, #0, 31f\n" + "str s10, [x15, #0x0]\n" + "b 31f\n" + "27:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x8, #2, 29f\n" + "st1 { v8.4s }, [x15], #0x10\n" + "tbz x8, #1, 28f\n" + 
"str d9, [x15], #0x8\n" + "tbz x8, #0, 31f\n" + "st1 { v9.s }[2], [x15]\n" + "b 31f\n" + "28:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x8, #0, 31f\n" + "str s9, [x15, #0x0]\n" + "b 31f\n" + "29:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x8, #1, 30f\n" + "str d8, [x15], #0x8\n" + "tbz x8, #0, 31f\n" + "st1 { v8.s }[2], [x15]\n" + "b 31f\n" + "30:" // Height 1: Partial direct writeback: partial_1_0 + "str s8, [x15, #0x0]\n" + "31:" // Height 1: Partial direct writeback: Done + "b 33f\n" + "32:" // Height 1: Full writeback + "str q8, [x15, #0x0]\n" + "str q9, [x15, #0x10]\n" + "str q10, [x15, #0x20]\n" + "str q11, [x15, #0x30]\n" + "add x15, x15, #0x40\n" + "33:" // Height 1: Writeback done + "subs x8, x8, #0x10\n" + "bgt 2b\n" + "b 200f\n" + "34:" // Height 2 + "ldr x8, [%x[args_ptr], %[offsetof_N]]\n" + "mov x16, %x[bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x15, %x[output_ptr]\n" + "35:" // Height 2: Column loop + "cbz x16, 36f\n" + "ldr q8, [x16, #0x0]\n" + "ldr q9, [x16, #0x10]\n" + "ldr q10, [x16, #0x20]\n" + "mov v12.16b, v8.16b\n" + "ldr q11, [x16, #0x30]\n" + "mov v13.16b, v9.16b\n" + "add x16, x16, #0x40\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "b 47f\n" + "36:" // Height 2: no bias + "tbz %x[flags], #0, 46f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x8, #0x10\n" + "add x25, x15, x19, LSL #2\n" + "bge 45f\n" + "tbz x8, #3, 40f\n" + "ld1 { v8.4s }, [x15], #0x10\n" + "ld1 { v12.4s }, [x25], #0x10\n" + "ld1 { v9.4s }, [x15], #0x10\n" + "ld1 { v13.4s }, [x25], #0x10\n" + "tbz x8, #2, 38f\n" + "ld1 { v10.4s }, [x15], #0x10\n" + "ld1 { v14.4s }, [x25], #0x10\n" + "tbz x8, #1, 37f\n" + "mov x19, #0x38\n" + "ldr d11, [x15], #0x8\n" + "ldr d15, [x25], #0x8\n" + "tbz x8, #0, 44f\n" + "ld1 { v11.s }[2], [x15]\n" + "ld1 { v15.s }[2], [x25]\n" + "b 44f\n" + "37:" // Height 2: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x8, #0, 44f\n" + "ldr s11, [x15, 
#0x0]\n" + "ldr s15, [x25, #0x0]\n" + "b 44f\n" + "38:" // Height 2: Partial accumulate: partial_2_8 + "tbz x8, #1, 39f\n" + "ldr d10, [x15], #0x8\n" + "ldr d14, [x25], #0x8\n" + "mov x19, #0x28\n" + "tbz x8, #0, 44f\n" + "ld1 { v10.s }[2], [x15]\n" + "ld1 { v14.s }[2], [x25]\n" + "b 44f\n" + "39:" // Height 2: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x8, #0, 44f\n" + "ldr s10, [x15, #0x0]\n" + "ldr s14, [x25, #0x0]\n" + "b 44f\n" + "40:" // Height 2: Partial accumulate: partial_4_0 + "tbz x8, #2, 42f\n" + "ld1 { v8.4s }, [x15], #0x10\n" + "ld1 { v12.4s }, [x25], #0x10\n" + "tbz x8, #1, 41f\n" + "mov x19, #0x18\n" + "ldr d9, [x15], #0x8\n" + "ldr d13, [x25], #0x8\n" + "tbz x8, #0, 44f\n" + "ld1 { v9.s }[2], [x15]\n" + "ld1 { v13.s }[2], [x25]\n" + "b 44f\n" + "41:" // Height 2: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x8, #0, 44f\n" + "ldr s9, [x15, #0x0]\n" + "ldr s13, [x25, #0x0]\n" + "b 44f\n" + "42:" // Height 2: Partial accumulate: partial_2_0 + "tbz x8, #1, 43f\n" + "ldr d8, [x15], #0x8\n" + "ldr d12, [x25], #0x8\n" + "mov x19, #0x8\n" + "tbz x8, #0, 44f\n" + "ld1 { v8.s }[2], [x15]\n" + "ld1 { v12.s }[2], [x25]\n" + "b 44f\n" + "43:" // Height 2: Partial accumulate: partial_1_0 + "ldr s8, [x15, #0x0]\n" + "mov x19, #0x0\n" + "ldr s12, [x25, #0x0]\n" + "44:" // Height 2: Partial accumulate: Done + "sub x15, x15, x19\n" + "b 47f\n" + "45:" // Height 2: full accumulate + "ldr q8, [x15, #0x0]\n" + "ldr q9, [x15, #0x10]\n" + "ldr q10, [x15, #0x20]\n" + "ldr q11, [x15, #0x30]\n" + "ldr q12, [x25, #0x0]\n" + "ldr q13, [x25, #0x10]\n" + "ldr q14, [x25, #0x20]\n" + "ldr q15, [x25, #0x30]\n" + "b 47f\n" + "46:" // Height 2: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "47:" // Height 2: setup done + "mov x14, #0x0\n" + "48:" // Height 2: String loop + "ldr x20, 
[%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w13, [x20, x14, LSL #0x2]\n" + "tbz %x[flags], #3, 49f\n" + "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x12, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "cbnz x14, 50f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x12, x12, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "b 50f\n" + "49:" // Height 2: setup direct input + "mov x12, %x[input_ptr]\n" + "add x28, x12, x19, LSL #2\n" + "50:" // Height 2: input setup done + "cmp x13, #0x4\n" + "blt 53f\n" + "ldr q0, [x12, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "cmp x13, #0x8\n" + "ldr q6, [x17, #0x0]\n" + "blt 52f\n" + "51:" // Height 2: Multiply loop: Main loop head + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr d7, [x17, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "ldr x11, [x17, #0x18]\n" + "ldr d6, [x17, #0x20]\n" + "add x12, x12, #0x10\n" + "ldr x10, [x17, #0x28]\n" + "add x28, x28, #0x10\n" + "mov v7.d[1], x11\n" + "prfm pldl1keep, [x12, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "sub x13, x13, #0x4\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "mov v6.d[1], x10\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "ldr d7, [x17, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "ldr x11, [x17, #0x38]\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "ldr d6, [x17, #0x40]\n" + "ldr x10, [x17, #0x48]\n" + "cmp x13, #0x8\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x58]\n" + "ldr x9, [x12, #0x8]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "mov v6.d[1], x10\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "ldr d7, [x17, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "ldr x10, [x17, #0x68]\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "ldr d6, [x17, #0x60]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x78]\n" + "ldr x27, [x28, #0x8]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "mov v6.d[1], x10\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "ldr d7, [x17, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "ldr x10, [x17, 
#0x88]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "ldr d6, [x17, #0x80]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x98]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "mov v6.d[1], x10\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "ldr d7, [x17, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "ldr x10, [x17, #0xa8]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "ldr d6, [x17, #0xa0]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0xb8]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "mov v6.d[1], x10\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "ldr d7, [x17, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "ldr x10, [x17, #0xc8]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "ldr d6, [x17, #0xc0]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0xd8]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "mov v6.d[1], x10\n" + "fmla v15.4s, v7.4s, v1.s[2]\n" + "ldr d7, [x17, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "ldr x10, [x17, #0xe8]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "ldr d6, [x17, #0xe0]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0xf8]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "mov v6.d[1], x10\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "ldr d7, [x17, #0xf0]\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "add x17, x17, #0x100\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "ldr d6, [x17, #0x0]\n" + "mov v7.d[1], x11\n" + "ldr x10, [x17, #0x8]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "ldr d0, [x12, #0x0]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "mov v6.d[1], x10\n" + "ldr d1, [x28, #0x0]\n" + "mov v0.d[1], x9\n" + "mov v1.d[1], x27\n" + "bge 51b\n" + "52:" // Height 2: Multiply loop: Single iteration only + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x17, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "ldr q6, [x17, #0x20]\n" + "sub x13, x13, #0x4\n" + "add x12, x12, #0x10\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "ldr q7, [x17, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "add x28, x28, #0x10\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla v11.4s, v7.4s, 
v0.s[0]\n" + "ldr q6, [x17, #0x40]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "ldr q7, [x17, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "ldr q6, [x17, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "ldr q7, [x17, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "ldr q6, [x17, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "ldr q7, [x17, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "ldr q6, [x17, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "ldr q7, [x17, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "ldr q6, [x17, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v15.4s, v7.4s, v1.s[2]\n" + "ldr q7, [x17, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "ldr q6, [x17, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "ldr q7, [x17, #0xf0]\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "add x17, x17, #0x100\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "53:" // Height 2: Multiply loop: Main loop skip + "cbz x13, 55f\n" + "54:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x12], #0x4\n" + "sub x13, x13, #0x1\n" + "ldr s1, [x28], #0x4\n" + "ldr q6, [x17, #0x0]\n" + "ldr q7, [x17, #0x10]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "ldr q6, [x17, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "ldr q7, [x17, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "add x17, x17, #0x40\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "cbnz x13, 54b\n" + "55:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x14, x14, #0x1\n" + "cmp x14, x19\n" + "bne 48b\n" + 
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x15, #0x0]\n" + "add x25, x15, x19, LSL #2\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbz %x[flags], #1, 56f\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v1.4s }, [x20]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "56:" // Height 2: No activation + "cmp x8, #0x10\n" + "bge 65f\n" + "tbz x8, #3, 60f\n" + "st1 { v8.4s }, [x15], #0x10\n" + "st1 { v9.4s }, [x15], #0x10\n" + "st1 { v12.4s }, [x25], #0x10\n" + "st1 { v13.4s }, [x25], #0x10\n" + "tbz x8, #2, 58f\n" + "st1 { v10.4s }, [x15], #0x10\n" + "st1 { v14.4s }, [x25], #0x10\n" + "tbz x8, #1, 57f\n" + "str d11, [x15], #0x8\n" + "str d15, [x25], #0x8\n" + "tbz x8, #0, 64f\n" + "st1 { v11.s }[2], [x15]\n" + "st1 { v15.s }[2], [x25]\n" + "b 64f\n" + "57:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x8, #0, 64f\n" + "str s11, [x15, #0x0]\n" + "str s15, [x25, #0x0]\n" + "b 64f\n" + "58:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x8, #1, 59f\n" + "str d10, [x15], #0x8\n" + "str d14, [x25], #0x8\n" + "tbz x8, #0, 64f\n" + "st1 { v10.s }[2], [x15]\n" + "st1 { v14.s }[2], [x25]\n" + "b 64f\n" + "59:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x8, #0, 64f\n" + "str s10, [x15, #0x0]\n" + "str s14, [x25, #0x0]\n" + "b 64f\n" + "60:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x8, #2, 62f\n" + "st1 { v8.4s }, [x15], #0x10\n" + "st1 { v12.4s }, 
[x25], #0x10\n" + "tbz x8, #1, 61f\n" + "str d9, [x15], #0x8\n" + "str d13, [x25], #0x8\n" + "tbz x8, #0, 64f\n" + "st1 { v9.s }[2], [x15]\n" + "st1 { v13.s }[2], [x25]\n" + "b 64f\n" + "61:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x8, #0, 64f\n" + "str s9, [x15, #0x0]\n" + "str s13, [x25, #0x0]\n" + "b 64f\n" + "62:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x8, #1, 63f\n" + "str d8, [x15], #0x8\n" + "str d12, [x25], #0x8\n" + "tbz x8, #0, 64f\n" + "st1 { v8.s }[2], [x15]\n" + "st1 { v12.s }[2], [x25]\n" + "b 64f\n" + "63:" // Height 2: Partial direct writeback: partial_1_0 + "str s8, [x15, #0x0]\n" + "str s12, [x25, #0x0]\n" + "64:" // Height 2: Partial direct writeback: Done + "b 66f\n" + "65:" // Height 2: Full writeback + "str q8, [x15, #0x0]\n" + "str q9, [x15, #0x10]\n" + "str q10, [x15, #0x20]\n" + "str q11, [x15, #0x30]\n" + "add x15, x15, #0x40\n" + "str q12, [x25, #0x0]\n" + "str q13, [x25, #0x10]\n" + "str q14, [x25, #0x20]\n" + "str q15, [x25, #0x30]\n" + "66:" // Height 2: Writeback done + "subs x8, x8, #0x10\n" + "bgt 35b\n" + "b 200f\n" + "67:" // Height 3 + "ldr x8, [%x[args_ptr], %[offsetof_N]]\n" + "mov x16, %x[bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x15, %x[output_ptr]\n" + "68:" // Height 3: Column loop + "cbz x16, 69f\n" + "ldr q8, [x16, #0x0]\n" + "ldr q9, [x16, #0x10]\n" + "ldr q10, [x16, #0x20]\n" + "mov v12.16b, v8.16b\n" + "mov v16.16b, v8.16b\n" + "mov v13.16b, v9.16b\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v18.16b, v10.16b\n" + "ldr q11, [x16, #0x30]\n" + "add x16, x16, #0x40\n" + "mov v15.16b, v11.16b\n" + "mov v19.16b, v11.16b\n" + "b 80f\n" + "69:" // Height 3: no bias + "tbz %x[flags], #0, 79f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x8, #0x10\n" + "add x25, x15, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "bge 78f\n" + "tbz x8, #3, 73f\n" + "ld1 { v8.4s }, [x15], #0x10\n" + "ld1 { v12.4s }, [x25], #0x10\n" + "ld1 { 
v16.4s }, [x24], #0x10\n" + "ld1 { v9.4s }, [x15], #0x10\n" + "ld1 { v13.4s }, [x25], #0x10\n" + "ld1 { v17.4s }, [x24], #0x10\n" + "tbz x8, #2, 71f\n" + "ld1 { v10.4s }, [x15], #0x10\n" + "ld1 { v14.4s }, [x25], #0x10\n" + "ld1 { v18.4s }, [x24], #0x10\n" + "tbz x8, #1, 70f\n" + "ldr d11, [x15], #0x8\n" + "mov x19, #0x38\n" + "ldr d15, [x25], #0x8\n" + "ldr d19, [x24], #0x8\n" + "tbz x8, #0, 77f\n" + "ld1 { v11.s }[2], [x15]\n" + "ld1 { v15.s }[2], [x25]\n" + "ld1 { v19.s }[2], [x24]\n" + "b 77f\n" + "70:" // Height 3: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x8, #0, 77f\n" + "ldr s11, [x15, #0x0]\n" + "ldr s15, [x25, #0x0]\n" + "ldr s19, [x24, #0x0]\n" + "b 77f\n" + "71:" // Height 3: Partial accumulate: partial_2_8 + "tbz x8, #1, 72f\n" + "ldr d10, [x15], #0x8\n" + "ldr d14, [x25], #0x8\n" + "mov x19, #0x28\n" + "ldr d18, [x24], #0x8\n" + "tbz x8, #0, 77f\n" + "ld1 { v10.s }[2], [x15]\n" + "ld1 { v14.s }[2], [x25]\n" + "ld1 { v18.s }[2], [x24]\n" + "b 77f\n" + "72:" // Height 3: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x8, #0, 77f\n" + "ldr s10, [x15, #0x0]\n" + "ldr s14, [x25, #0x0]\n" + "ldr s18, [x24, #0x0]\n" + "b 77f\n" + "73:" // Height 3: Partial accumulate: partial_4_0 + "tbz x8, #2, 75f\n" + "ld1 { v8.4s }, [x15], #0x10\n" + "ld1 { v12.4s }, [x25], #0x10\n" + "ld1 { v16.4s }, [x24], #0x10\n" + "tbz x8, #1, 74f\n" + "ldr d9, [x15], #0x8\n" + "mov x19, #0x18\n" + "ldr d13, [x25], #0x8\n" + "ldr d17, [x24], #0x8\n" + "tbz x8, #0, 77f\n" + "ld1 { v9.s }[2], [x15]\n" + "ld1 { v13.s }[2], [x25]\n" + "ld1 { v17.s }[2], [x24]\n" + "b 77f\n" + "74:" // Height 3: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x8, #0, 77f\n" + "ldr s9, [x15, #0x0]\n" + "ldr s13, [x25, #0x0]\n" + "ldr s17, [x24, #0x0]\n" + "b 77f\n" + "75:" // Height 3: Partial accumulate: partial_2_0 + "tbz x8, #1, 76f\n" + "ldr d8, [x15], #0x8\n" + "ldr d12, [x25], #0x8\n" + "mov x19, #0x8\n" + "ldr d16, [x24], #0x8\n" + "tbz x8, #0, 77f\n" + 
"ld1 { v8.s }[2], [x15]\n" + "ld1 { v12.s }[2], [x25]\n" + "ld1 { v16.s }[2], [x24]\n" + "b 77f\n" + "76:" // Height 3: Partial accumulate: partial_1_0 + "ldr s8, [x15, #0x0]\n" + "mov x19, #0x0\n" + "ldr s12, [x25, #0x0]\n" + "ldr s16, [x24, #0x0]\n" + "77:" // Height 3: Partial accumulate: Done + "sub x15, x15, x19\n" + "b 80f\n" + "78:" // Height 3: full accumulate + "ldr q8, [x15, #0x0]\n" + "ldr q9, [x15, #0x10]\n" + "ldr q10, [x15, #0x20]\n" + "ldr q11, [x15, #0x30]\n" + "ldr q12, [x25, #0x0]\n" + "ldr q13, [x25, #0x10]\n" + "ldr q14, [x25, #0x20]\n" + "ldr q15, [x25, #0x30]\n" + "ldr q16, [x24, #0x0]\n" + "ldr q17, [x24, #0x10]\n" + "ldr q18, [x24, #0x20]\n" + "ldr q19, [x24, #0x30]\n" + "b 80f\n" + "79:" // Height 3: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "80:" // Height 3: setup done + "mov x14, #0x0\n" + "81:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w13, [x20, x14, LSL #0x2]\n" + "tbz %x[flags], #3, 82f\n" + "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x12, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "cbnz x14, 83f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x12, x12, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "b 83f\n" + "82:" // Height 3: setup direct input + "mov x12, %x[input_ptr]\n" + "add x28, x12, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "83:" // Height 3: input setup done + "cmp x13, #0x4\n" + "blt 86f\n" + "ldr q0, [x12, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "cmp x13, #0x8\n" + "ldr q2, [x26, #0x0]\n" + "ldr q6, [x17, #0x0]\n" + "blt 
85f\n" + "84:" // Height 3: Multiply loop: Main loop head + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr d7, [x17, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "ldr x11, [x17, #0x18]\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "ldr d6, [x17, #0x20]\n" + "ldr x10, [x17, #0x28]\n" + "add x12, x12, #0x10\n" + "mov v7.d[1], x11\n" + "prfm pldl1keep, [x12, #0x80]\n" + "ldr x11, [x17, #0x38]\n" + "add x28, x28, #0x10\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "mov v6.d[1], x10\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "ldr d7, [x17, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "ldr x10, [x17, #0x48]\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "ldr x9, [x12, #0x8]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "mov v7.d[1], x11\n" + "ldr d6, [x17, #0x40]\n" + "add x26, x26, #0x10\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "ldr x11, [x17, #0x58]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x50]\n" + "sub x13, x13, #0x4\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "ldr x10, [x17, #0x68]\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "ldr x27, [x28, #0x8]\n" + "fmla v16.4s, v6.4s, v2.s[1]\n" + "mov v7.d[1], x11\n" + "ldr d6, [x17, #0x60]\n" + "cmp x13, #0x8\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "ldr x11, [x17, #0x78]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "ldr x25, [x26, #0x8]\n" + "fmla v17.4s, v7.4s, v2.s[1]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "ldr x10, [x17, #0x88]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "fmla v18.4s, v6.4s, v2.s[1]\n" + "mov v7.d[1], x11\n" + "ldr d6, [x17, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "ldr x11, [x17, #0x98]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "fmla v19.4s, v7.4s, v2.s[1]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "ldr x10, [x17, #0xa8]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "fmla v16.4s, v6.4s, v2.s[2]\n" + "mov 
v7.d[1], x11\n" + "ldr d6, [x17, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "ldr x11, [x17, #0xb8]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "fmla v17.4s, v7.4s, v2.s[2]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "ldr x10, [x17, #0xc8]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "fmla v18.4s, v6.4s, v2.s[2]\n" + "mov v7.d[1], x11\n" + "ldr d6, [x17, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "ldr x11, [x17, #0xd8]\n" + "fmla v15.4s, v7.4s, v1.s[2]\n" + "fmla v19.4s, v7.4s, v2.s[2]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "ldr x10, [x17, #0xe8]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "fmla v16.4s, v6.4s, v2.s[3]\n" + "mov v7.d[1], x11\n" + "ldr d6, [x17, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "ldr x11, [x17, #0xf8]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "fmla v17.4s, v7.4s, v2.s[3]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0xf0]\n" + "add x17, x17, #0x100\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "ldr x10, [x17, #0x8]\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v18.4s, v6.4s, v2.s[3]\n" + "mov v7.d[1], x11\n" + "ldr d6, [x17, #0x0]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "ldr d0, [x12, #0x0]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "ldr d1, [x28, #0x0]\n" + "fmla v19.4s, v7.4s, v2.s[3]\n" + "mov v6.d[1], x10\n" + "mov v0.d[1], x9\n" + "ldr d2, [x26, #0x0]\n" + "mov v1.d[1], x27\n" + "mov v2.d[1], x25\n" + "bge 84b\n" + "85:" // Height 3: Multiply loop: Single iteration only + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x17, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "sub x13, x13, #0x4\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "ldr q6, [x17, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "add x12, x12, #0x10\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "ldr q7, [x17, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "add x28, x28, #0x10\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla 
v18.4s, v6.4s, v2.s[0]\n" + "ldr q6, [x17, #0x40]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "add x26, x26, #0x10\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "ldr q7, [x17, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "fmla v16.4s, v6.4s, v2.s[1]\n" + "ldr q6, [x17, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "fmla v17.4s, v7.4s, v2.s[1]\n" + "ldr q7, [x17, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "fmla v18.4s, v6.4s, v2.s[1]\n" + "ldr q6, [x17, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "fmla v19.4s, v7.4s, v2.s[1]\n" + "ldr q7, [x17, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "fmla v16.4s, v6.4s, v2.s[2]\n" + "ldr q6, [x17, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "fmla v17.4s, v7.4s, v2.s[2]\n" + "ldr q7, [x17, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "fmla v18.4s, v6.4s, v2.s[2]\n" + "ldr q6, [x17, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v15.4s, v7.4s, v1.s[2]\n" + "fmla v19.4s, v7.4s, v2.s[2]\n" + "ldr q7, [x17, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "fmla v16.4s, v6.4s, v2.s[3]\n" + "ldr q6, [x17, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "fmla v17.4s, v7.4s, v2.s[3]\n" + "ldr q7, [x17, #0xf0]\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "add x17, x17, #0x100\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v18.4s, v6.4s, v2.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v19.4s, v7.4s, v2.s[3]\n" + "86:" // Height 3: Multiply loop: Main loop skip + "cbz x13, 88f\n" + "87:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x12], #0x4\n" + "sub x13, x13, #0x1\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr q6, [x17, 
#0x0]\n" + "ldr q7, [x17, #0x10]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "ldr q6, [x17, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "ldr q7, [x17, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "add x17, x17, #0x40\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "cbnz x13, 87b\n" + "88:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x14, x14, #0x1\n" + "cmp x14, x19\n" + "bne 81b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x15, #0x0]\n" + "add x25, x15, x19, LSL #2\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add x24, x25, x19, LSL #2\n" + "prfm pstl1keep, [x24, #0x0]\n" + "tbz %x[flags], #1, 89f\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v1.4s }, [x20]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "89:" // Height 3: No activation + "cmp x8, #0x10\n" + "bge 98f\n" + "tbz x8, #3, 93f\n" + "st1 { v8.4s }, 
[x15], #0x10\n" + "st1 { v9.4s }, [x15], #0x10\n" + "st1 { v12.4s }, [x25], #0x10\n" + "st1 { v13.4s }, [x25], #0x10\n" + "st1 { v16.4s }, [x24], #0x10\n" + "st1 { v17.4s }, [x24], #0x10\n" + "tbz x8, #2, 91f\n" + "st1 { v10.4s }, [x15], #0x10\n" + "st1 { v14.4s }, [x25], #0x10\n" + "st1 { v18.4s }, [x24], #0x10\n" + "tbz x8, #1, 90f\n" + "str d11, [x15], #0x8\n" + "str d15, [x25], #0x8\n" + "str d19, [x24], #0x8\n" + "tbz x8, #0, 97f\n" + "st1 { v11.s }[2], [x15]\n" + "st1 { v15.s }[2], [x25]\n" + "st1 { v19.s }[2], [x24]\n" + "b 97f\n" + "90:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x8, #0, 97f\n" + "str s11, [x15, #0x0]\n" + "str s15, [x25, #0x0]\n" + "str s19, [x24, #0x0]\n" + "b 97f\n" + "91:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x8, #1, 92f\n" + "str d10, [x15], #0x8\n" + "str d14, [x25], #0x8\n" + "str d18, [x24], #0x8\n" + "tbz x8, #0, 97f\n" + "st1 { v10.s }[2], [x15]\n" + "st1 { v14.s }[2], [x25]\n" + "st1 { v18.s }[2], [x24]\n" + "b 97f\n" + "92:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x8, #0, 97f\n" + "str s10, [x15, #0x0]\n" + "str s14, [x25, #0x0]\n" + "str s18, [x24, #0x0]\n" + "b 97f\n" + "93:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x8, #2, 95f\n" + "st1 { v8.4s }, [x15], #0x10\n" + "st1 { v12.4s }, [x25], #0x10\n" + "st1 { v16.4s }, [x24], #0x10\n" + "tbz x8, #1, 94f\n" + "str d9, [x15], #0x8\n" + "str d13, [x25], #0x8\n" + "str d17, [x24], #0x8\n" + "tbz x8, #0, 97f\n" + "st1 { v9.s }[2], [x15]\n" + "st1 { v13.s }[2], [x25]\n" + "st1 { v17.s }[2], [x24]\n" + "b 97f\n" + "94:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x8, #0, 97f\n" + "str s9, [x15, #0x0]\n" + "str s13, [x25, #0x0]\n" + "str s17, [x24, #0x0]\n" + "b 97f\n" + "95:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x8, #1, 96f\n" + "str d8, [x15], #0x8\n" + "str d12, [x25], #0x8\n" + "str d16, [x24], #0x8\n" + "tbz x8, #0, 97f\n" + "st1 { v8.s }[2], [x15]\n" + "st1 { v12.s }[2], 
[x25]\n" + "st1 { v16.s }[2], [x24]\n" + "b 97f\n" + "96:" // Height 3: Partial direct writeback: partial_1_0 + "str s8, [x15, #0x0]\n" + "str s12, [x25, #0x0]\n" + "str s16, [x24, #0x0]\n" + "97:" // Height 3: Partial direct writeback: Done + "b 99f\n" + "98:" // Height 3: Full writeback + "str q8, [x15, #0x0]\n" + "str q9, [x15, #0x10]\n" + "str q10, [x15, #0x20]\n" + "str q11, [x15, #0x30]\n" + "add x15, x15, #0x40\n" + "str q12, [x25, #0x0]\n" + "str q13, [x25, #0x10]\n" + "str q14, [x25, #0x20]\n" + "str q15, [x25, #0x30]\n" + "str q16, [x24, #0x0]\n" + "str q17, [x24, #0x10]\n" + "str q18, [x24, #0x20]\n" + "str q19, [x24, #0x30]\n" + "99:" // Height 3: Writeback done + "subs x8, x8, #0x10\n" + "bgt 68b\n" + "b 200f\n" + "100:" // Height 4 + "ldr x8, [%x[args_ptr], %[offsetof_N]]\n" + "mov x16, %x[bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x15, %x[output_ptr]\n" + "101:" // Height 4: Column loop + "cbz x16, 102f\n" + "ldr q8, [x16, #0x0]\n" + "ldr q9, [x16, #0x10]\n" + "ldr q10, [x16, #0x20]\n" + "mov v12.16b, v8.16b\n" + "mov v16.16b, v8.16b\n" + "mov v13.16b, v9.16b\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v18.16b, v10.16b\n" + "mov v20.16b, v8.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "ldr q11, [x16, #0x30]\n" + "add x16, x16, #0x40\n" + "mov v15.16b, v11.16b\n" + "mov v19.16b, v11.16b\n" + "mov v23.16b, v11.16b\n" + "b 113f\n" + "102:" // Height 4: no bias + "tbz %x[flags], #0, 112f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x8, #0x10\n" + "add x25, x15, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "bge 111f\n" + "tbz x8, #3, 106f\n" + "ld1 { v8.4s }, [x15], #0x10\n" + "ld1 { v12.4s }, [x25], #0x10\n" + "ld1 { v16.4s }, [x24], #0x10\n" + "ld1 { v9.4s }, [x15], #0x10\n" + "ld1 { v13.4s }, [x25], #0x10\n" + "ld1 { v17.4s }, [x24], #0x10\n" + "ld1 { v20.4s }, [x23], #0x10\n" + "ld1 { v21.4s }, [x23], #0x10\n" + "tbz x8, #2, 104f\n" + 
"ld1 { v10.4s }, [x15], #0x10\n" + "ld1 { v14.4s }, [x25], #0x10\n" + "ld1 { v18.4s }, [x24], #0x10\n" + "ld1 { v22.4s }, [x23], #0x10\n" + "tbz x8, #1, 103f\n" + "ldr d11, [x15], #0x8\n" + "mov x19, #0x38\n" + "ldr d15, [x25], #0x8\n" + "ldr d19, [x24], #0x8\n" + "ldr d23, [x23], #0x8\n" + "tbz x8, #0, 110f\n" + "ld1 { v11.s }[2], [x15]\n" + "ld1 { v15.s }[2], [x25]\n" + "ld1 { v19.s }[2], [x24]\n" + "ld1 { v23.s }[2], [x23]\n" + "b 110f\n" + "103:" // Height 4: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x8, #0, 110f\n" + "ldr s11, [x15, #0x0]\n" + "ldr s15, [x25, #0x0]\n" + "ldr s19, [x24, #0x0]\n" + "ldr s23, [x23, #0x0]\n" + "b 110f\n" + "104:" // Height 4: Partial accumulate: partial_2_8 + "tbz x8, #1, 105f\n" + "ldr d10, [x15], #0x8\n" + "ldr d14, [x25], #0x8\n" + "mov x19, #0x28\n" + "ldr d18, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "tbz x8, #0, 110f\n" + "ld1 { v10.s }[2], [x15]\n" + "ld1 { v14.s }[2], [x25]\n" + "ld1 { v18.s }[2], [x24]\n" + "ld1 { v22.s }[2], [x23]\n" + "b 110f\n" + "105:" // Height 4: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x8, #0, 110f\n" + "ldr s10, [x15, #0x0]\n" + "ldr s14, [x25, #0x0]\n" + "ldr s18, [x24, #0x0]\n" + "ldr s22, [x23, #0x0]\n" + "b 110f\n" + "106:" // Height 4: Partial accumulate: partial_4_0 + "tbz x8, #2, 108f\n" + "ld1 { v8.4s }, [x15], #0x10\n" + "ld1 { v12.4s }, [x25], #0x10\n" + "ld1 { v16.4s }, [x24], #0x10\n" + "ld1 { v20.4s }, [x23], #0x10\n" + "tbz x8, #1, 107f\n" + "ldr d9, [x15], #0x8\n" + "mov x19, #0x18\n" + "ldr d13, [x25], #0x8\n" + "ldr d17, [x24], #0x8\n" + "ldr d21, [x23], #0x8\n" + "tbz x8, #0, 110f\n" + "ld1 { v9.s }[2], [x15]\n" + "ld1 { v13.s }[2], [x25]\n" + "ld1 { v17.s }[2], [x24]\n" + "ld1 { v21.s }[2], [x23]\n" + "b 110f\n" + "107:" // Height 4: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x8, #0, 110f\n" + "ldr s9, [x15, #0x0]\n" + "ldr s13, [x25, #0x0]\n" + "ldr s17, [x24, #0x0]\n" + "ldr s21, [x23, #0x0]\n" + "b 110f\n" + "108:" // 
Height 4: Partial accumulate: partial_2_0 + "tbz x8, #1, 109f\n" + "ldr d8, [x15], #0x8\n" + "ldr d12, [x25], #0x8\n" + "mov x19, #0x8\n" + "ldr d16, [x24], #0x8\n" + "ldr d20, [x23], #0x8\n" + "tbz x8, #0, 110f\n" + "ld1 { v8.s }[2], [x15]\n" + "ld1 { v12.s }[2], [x25]\n" + "ld1 { v16.s }[2], [x24]\n" + "ld1 { v20.s }[2], [x23]\n" + "b 110f\n" + "109:" // Height 4: Partial accumulate: partial_1_0 + "ldr s8, [x15, #0x0]\n" + "mov x19, #0x0\n" + "ldr s12, [x25, #0x0]\n" + "ldr s16, [x24, #0x0]\n" + "ldr s20, [x23, #0x0]\n" + "110:" // Height 4: Partial accumulate: Done + "sub x15, x15, x19\n" + "b 113f\n" + "111:" // Height 4: full accumulate + "ldr q8, [x15, #0x0]\n" + "ldr q9, [x15, #0x10]\n" + "ldr q10, [x15, #0x20]\n" + "ldr q11, [x15, #0x30]\n" + "ldr q12, [x25, #0x0]\n" + "ldr q13, [x25, #0x10]\n" + "ldr q14, [x25, #0x20]\n" + "ldr q15, [x25, #0x30]\n" + "ldr q16, [x24, #0x0]\n" + "ldr q17, [x24, #0x10]\n" + "ldr q18, [x24, #0x20]\n" + "ldr q19, [x24, #0x30]\n" + "ldr q20, [x23, #0x0]\n" + "ldr q21, [x23, #0x10]\n" + "ldr q22, [x23, #0x20]\n" + "ldr q23, [x23, #0x30]\n" + "b 113f\n" + "112:" // Height 4: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "113:" // Height 4: setup done + "mov x14, #0x0\n" + "114:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w13, [x20, x14, LSL #0x2]\n" + "tbz %x[flags], #3, 115f\n" + "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x12, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + 
"cbnz x14, 116f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x12, x12, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "b 116f\n" + "115:" // Height 4: setup direct input + "mov x12, %x[input_ptr]\n" + "add x28, x12, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "add x24, x26, x19, LSL #2\n" + "116:" // Height 4: input setup done + "cmp x13, #0x4\n" + "blt 119f\n" + "ldr q0, [x12, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "cmp x13, #0x8\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q6, [x17, #0x0]\n" + "blt 118f\n" + "117:" // Height 4: Multiply loop: Main loop head + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr d7, [x17, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "ldr x11, [x17, #0x18]\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "ldr x10, [x17, #0x28]\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "ldr d6, [x17, #0x20]\n" + "mov v7.d[1], x11\n" + "ldr x11, [x17, #0x38]\n" + "add x12, x12, #0x10\n" + "add x28, x28, #0x10\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "mov v6.d[1], x10\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "ldr d7, [x17, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "ldr x10, [x17, #0x48]\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "ldr x9, [x12, #0x8]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "mov v7.d[1], x11\n" + "fmla v22.4s, v6.4s, v3.s[0]\n" + "ldr d6, [x17, #0x40]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "ldr x11, [x17, #0x58]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "ldr x27, [x28, #0x8]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "mov v6.d[1], x10\n" + "fmla v23.4s, v7.4s, v3.s[0]\n" + "ldr d7, [x17, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "ldr x10, [x17, #0x68]\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "add x26, x26, #0x10\n" + "fmla v16.4s, v6.4s, v2.s[1]\n" + "mov v7.d[1], x11\n" + "fmla v20.4s, v6.4s, v3.s[1]\n" + "prfm pldl1keep, [x26, 
#0x80]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "ldr d6, [x17, #0x60]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "ldr x11, [x17, #0x78]\n" + "fmla v17.4s, v7.4s, v2.s[1]\n" + "ldr x25, [x26, #0x8]\n" + "fmla v21.4s, v7.4s, v3.s[1]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x70]\n" + "add x24, x24, #0x10\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "ldr x10, [x17, #0x88]\n" + "fmla v18.4s, v6.4s, v2.s[1]\n" + "mov v7.d[1], x11\n" + "fmla v22.4s, v6.4s, v3.s[1]\n" + "ldr d6, [x17, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "ldr x11, [x17, #0x98]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "ldr x23, [x24, #0x8]\n" + "fmla v19.4s, v7.4s, v2.s[1]\n" + "mov v6.d[1], x10\n" + "fmla v23.4s, v7.4s, v3.s[1]\n" + "ldr d7, [x17, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "ldr x10, [x17, #0xa8]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "sub x13, x13, #0x4\n" + "fmla v16.4s, v6.4s, v2.s[2]\n" + "mov v7.d[1], x11\n" + "fmla v20.4s, v6.4s, v3.s[2]\n" + "ldr d6, [x17, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "ldr x11, [x17, #0xb8]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "cmp x13, #0x8\n" + "fmla v17.4s, v7.4s, v2.s[2]\n" + "mov v6.d[1], x10\n" + "fmla v21.4s, v7.4s, v3.s[2]\n" + "ldr d7, [x17, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "ldr x10, [x17, #0xc8]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "fmla v18.4s, v6.4s, v2.s[2]\n" + "mov v7.d[1], x11\n" + "fmla v22.4s, v6.4s, v3.s[2]\n" + "ldr d6, [x17, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "ldr x11, [x17, #0xd8]\n" + "fmla v15.4s, v7.4s, v1.s[2]\n" + "fmla v19.4s, v7.4s, v2.s[2]\n" + "mov v6.d[1], x10\n" + "fmla v23.4s, v7.4s, v3.s[2]\n" + "ldr d7, [x17, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "ldr x10, [x17, #0xe8]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "fmla v16.4s, v6.4s, v2.s[3]\n" + "mov v7.d[1], x11\n" + "fmla v20.4s, v6.4s, v3.s[3]\n" + "ldr d6, [x17, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "ldr x11, [x17, #0xf8]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" 
+ "fmla v17.4s, v7.4s, v2.s[3]\n" + "mov v6.d[1], x10\n" + "fmla v21.4s, v7.4s, v3.s[3]\n" + "ldr d7, [x17, #0xf0]\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "add x17, x17, #0x100\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "ldr x10, [x17, #0x8]\n" + "fmla v18.4s, v6.4s, v2.s[3]\n" + "mov v7.d[1], x11\n" + "fmla v22.4s, v6.4s, v3.s[3]\n" + "ldr d6, [x17, #0x0]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "ldr d0, [x12, #0x0]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "ldr d1, [x28, #0x0]\n" + "fmla v19.4s, v7.4s, v2.s[3]\n" + "mov v6.d[1], x10\n" + "fmla v23.4s, v7.4s, v3.s[3]\n" + "mov v0.d[1], x9\n" + "mov v1.d[1], x27\n" + "ldr d2, [x26, #0x0]\n" + "ldr d3, [x24, #0x0]\n" + "mov v2.d[1], x25\n" + "mov v3.d[1], x23\n" + "bge 117b\n" + "118:" // Height 4: Multiply loop: Single iteration only + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x17, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "sub x13, x13, #0x4\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "add x12, x12, #0x10\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "ldr q6, [x17, #0x20]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "add x28, x28, #0x10\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "ldr q7, [x17, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "add x26, x26, #0x10\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "add x24, x24, #0x10\n" + "fmla v22.4s, v6.4s, v3.s[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "ldr q6, [x17, #0x40]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "fmla v23.4s, v7.4s, v3.s[0]\n" + "ldr q7, [x17, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "fmla v16.4s, v6.4s, v2.s[1]\n" + "fmla v20.4s, v6.4s, v3.s[1]\n" + "ldr q6, [x17, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "fmla v17.4s, v7.4s, v2.s[1]\n" + 
"fmla v21.4s, v7.4s, v3.s[1]\n" + "ldr q7, [x17, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "fmla v18.4s, v6.4s, v2.s[1]\n" + "fmla v22.4s, v6.4s, v3.s[1]\n" + "ldr q6, [x17, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "fmla v19.4s, v7.4s, v2.s[1]\n" + "fmla v23.4s, v7.4s, v3.s[1]\n" + "ldr q7, [x17, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "fmla v16.4s, v6.4s, v2.s[2]\n" + "fmla v20.4s, v6.4s, v3.s[2]\n" + "ldr q6, [x17, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "fmla v17.4s, v7.4s, v2.s[2]\n" + "fmla v21.4s, v7.4s, v3.s[2]\n" + "ldr q7, [x17, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "fmla v18.4s, v6.4s, v2.s[2]\n" + "fmla v22.4s, v6.4s, v3.s[2]\n" + "ldr q6, [x17, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v15.4s, v7.4s, v1.s[2]\n" + "fmla v19.4s, v7.4s, v2.s[2]\n" + "fmla v23.4s, v7.4s, v3.s[2]\n" + "ldr q7, [x17, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "fmla v16.4s, v6.4s, v2.s[3]\n" + "fmla v20.4s, v6.4s, v3.s[3]\n" + "ldr q6, [x17, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "fmla v17.4s, v7.4s, v2.s[3]\n" + "fmla v21.4s, v7.4s, v3.s[3]\n" + "ldr q7, [x17, #0xf0]\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "add x17, x17, #0x100\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v18.4s, v6.4s, v2.s[3]\n" + "fmla v22.4s, v6.4s, v3.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v19.4s, v7.4s, v2.s[3]\n" + "fmla v23.4s, v7.4s, v3.s[3]\n" + "119:" // Height 4: Multiply loop: Main loop skip + "cbz x13, 121f\n" + "120:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x12], #0x4\n" + "sub x13, x13, #0x1\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr q6, [x17, #0x0]\n" + "ldr q7, [x17, #0x10]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + 
"fmla v12.4s, v6.4s, v1.s[0]\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "ldr q6, [x17, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "ldr q7, [x17, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "add x17, x17, #0x40\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "fmla v22.4s, v6.4s, v3.s[0]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "fmla v23.4s, v7.4s, v3.s[0]\n" + "cbnz x13, 120b\n" + "121:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x14, x14, #0x1\n" + "cmp x14, x19\n" + "bne 114b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x15, #0x0]\n" + "add x25, x15, x19, LSL #2\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add x24, x25, x19, LSL #2\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbz %x[flags], #1, 122f\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v1.4s }, [x20]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmin v20.4s, v20.4s, v0.4s\n" + "fmin 
v21.4s, v21.4s, v0.4s\n" + "fmin v22.4s, v22.4s, v0.4s\n" + "fmin v23.4s, v23.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "122:" // Height 4: No activation + "cmp x8, #0x10\n" + "bge 131f\n" + "tbz x8, #3, 126f\n" + "st1 { v8.4s }, [x15], #0x10\n" + "st1 { v9.4s }, [x15], #0x10\n" + "st1 { v12.4s }, [x25], #0x10\n" + "st1 { v13.4s }, [x25], #0x10\n" + "st1 { v16.4s }, [x24], #0x10\n" + "st1 { v17.4s }, [x24], #0x10\n" + "st1 { v20.4s }, [x23], #0x10\n" + "st1 { v21.4s }, [x23], #0x10\n" + "tbz x8, #2, 124f\n" + "st1 { v10.4s }, [x15], #0x10\n" + "st1 { v14.4s }, [x25], #0x10\n" + "st1 { v18.4s }, [x24], #0x10\n" + "st1 { v22.4s }, [x23], #0x10\n" + "tbz x8, #1, 123f\n" + "str d11, [x15], #0x8\n" + "str d15, [x25], #0x8\n" + "str d19, [x24], #0x8\n" + "str d23, [x23], #0x8\n" + "tbz x8, #0, 130f\n" + "st1 { v11.s }[2], [x15]\n" + "st1 { v15.s }[2], [x25]\n" + "st1 { v19.s }[2], [x24]\n" + "st1 { v23.s }[2], [x23]\n" + "b 130f\n" + "123:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x8, #0, 130f\n" + "str s11, [x15, #0x0]\n" + "str s15, [x25, #0x0]\n" + "str s19, [x24, #0x0]\n" + "str s23, [x23, #0x0]\n" + "b 130f\n" + "124:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x8, #1, 125f\n" + "str d10, [x15], #0x8\n" + "str d14, [x25], #0x8\n" + "str d18, [x24], #0x8\n" + "str d22, [x23], #0x8\n" + "tbz x8, #0, 130f\n" + "st1 { v10.s }[2], [x15]\n" + "st1 { v14.s }[2], [x25]\n" + "st1 { v18.s }[2], [x24]\n" + "st1 { v22.s }[2], [x23]\n" + "b 130f\n" + "125:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x8, #0, 130f\n" + "str s10, [x15, #0x0]\n" + "str s14, [x25, #0x0]\n" + "str s18, [x24, #0x0]\n" + "str s22, [x23, #0x0]\n" + "b 130f\n" + "126:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x8, #2, 128f\n" + "st1 { v8.4s }, [x15], #0x10\n" + "st1 { v12.4s 
}, [x25], #0x10\n" + "st1 { v16.4s }, [x24], #0x10\n" + "st1 { v20.4s }, [x23], #0x10\n" + "tbz x8, #1, 127f\n" + "str d9, [x15], #0x8\n" + "str d13, [x25], #0x8\n" + "str d17, [x24], #0x8\n" + "str d21, [x23], #0x8\n" + "tbz x8, #0, 130f\n" + "st1 { v9.s }[2], [x15]\n" + "st1 { v13.s }[2], [x25]\n" + "st1 { v17.s }[2], [x24]\n" + "st1 { v21.s }[2], [x23]\n" + "b 130f\n" + "127:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x8, #0, 130f\n" + "str s9, [x15, #0x0]\n" + "str s13, [x25, #0x0]\n" + "str s17, [x24, #0x0]\n" + "str s21, [x23, #0x0]\n" + "b 130f\n" + "128:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x8, #1, 129f\n" + "str d8, [x15], #0x8\n" + "str d12, [x25], #0x8\n" + "str d16, [x24], #0x8\n" + "str d20, [x23], #0x8\n" + "tbz x8, #0, 130f\n" + "st1 { v8.s }[2], [x15]\n" + "st1 { v12.s }[2], [x25]\n" + "st1 { v16.s }[2], [x24]\n" + "st1 { v20.s }[2], [x23]\n" + "b 130f\n" + "129:" // Height 4: Partial direct writeback: partial_1_0 + "str s8, [x15, #0x0]\n" + "str s12, [x25, #0x0]\n" + "str s16, [x24, #0x0]\n" + "str s20, [x23, #0x0]\n" + "130:" // Height 4: Partial direct writeback: Done + "b 132f\n" + "131:" // Height 4: Full writeback + "str q8, [x15, #0x0]\n" + "str q9, [x15, #0x10]\n" + "str q10, [x15, #0x20]\n" + "str q11, [x15, #0x30]\n" + "add x15, x15, #0x40\n" + "str q12, [x25, #0x0]\n" + "str q13, [x25, #0x10]\n" + "str q14, [x25, #0x20]\n" + "str q15, [x25, #0x30]\n" + "str q16, [x24, #0x0]\n" + "str q17, [x24, #0x10]\n" + "str q18, [x24, #0x20]\n" + "str q19, [x24, #0x30]\n" + "str q20, [x23, #0x0]\n" + "str q21, [x23, #0x10]\n" + "str q22, [x23, #0x20]\n" + "str q23, [x23, #0x30]\n" + "132:" // Height 4: Writeback done + "subs x8, x8, #0x10\n" + "bgt 101b\n" + "b 200f\n" + "133:" // Height 5 + "ldr x8, [%x[args_ptr], %[offsetof_N]]\n" + "mov x16, %x[bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x15, %x[output_ptr]\n" + "134:" // Height 5: Column loop + "cbz x16, 135f\n" + "ldr q8, [x16, #0x0]\n" + 
"ldr q9, [x16, #0x10]\n" + "ldr q10, [x16, #0x20]\n" + "mov v12.16b, v8.16b\n" + "mov v16.16b, v8.16b\n" + "mov v13.16b, v9.16b\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v18.16b, v10.16b\n" + "mov v20.16b, v8.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "mov v24.16b, v8.16b\n" + "mov v25.16b, v9.16b\n" + "mov v26.16b, v10.16b\n" + "ldr q11, [x16, #0x30]\n" + "add x16, x16, #0x40\n" + "mov v15.16b, v11.16b\n" + "mov v19.16b, v11.16b\n" + "mov v23.16b, v11.16b\n" + "mov v27.16b, v11.16b\n" + "b 146f\n" + "135:" // Height 5: no bias + "tbz %x[flags], #0, 145f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x8, #0x10\n" + "add x25, x15, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "bge 144f\n" + "tbz x8, #3, 139f\n" + "ld1 { v8.4s }, [x15], #0x10\n" + "ld1 { v12.4s }, [x25], #0x10\n" + "ld1 { v16.4s }, [x24], #0x10\n" + "ld1 { v9.4s }, [x15], #0x10\n" + "ld1 { v13.4s }, [x25], #0x10\n" + "ld1 { v17.4s }, [x24], #0x10\n" + "ld1 { v20.4s }, [x23], #0x10\n" + "ld1 { v24.4s }, [x22], #0x10\n" + "ld1 { v21.4s }, [x23], #0x10\n" + "ld1 { v25.4s }, [x22], #0x10\n" + "tbz x8, #2, 137f\n" + "ld1 { v10.4s }, [x15], #0x10\n" + "ld1 { v14.4s }, [x25], #0x10\n" + "ld1 { v18.4s }, [x24], #0x10\n" + "ld1 { v22.4s }, [x23], #0x10\n" + "ld1 { v26.4s }, [x22], #0x10\n" + "tbz x8, #1, 136f\n" + "ldr d11, [x15], #0x8\n" + "mov x19, #0x38\n" + "ldr d15, [x25], #0x8\n" + "ldr d19, [x24], #0x8\n" + "ldr d23, [x23], #0x8\n" + "ldr d27, [x22], #0x8\n" + "tbz x8, #0, 143f\n" + "ld1 { v11.s }[2], [x15]\n" + "ld1 { v15.s }[2], [x25]\n" + "ld1 { v19.s }[2], [x24]\n" + "ld1 { v23.s }[2], [x23]\n" + "ld1 { v27.s }[2], [x22]\n" + "b 143f\n" + "136:" // Height 5: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x8, #0, 143f\n" + "ldr s11, [x15, #0x0]\n" + "ldr s15, [x25, #0x0]\n" + "ldr s19, [x24, #0x0]\n" + "ldr s23, [x23, #0x0]\n" + "ldr s27, [x22, #0x0]\n" + "b 
143f\n" + "137:" // Height 5: Partial accumulate: partial_2_8 + "tbz x8, #1, 138f\n" + "ldr d10, [x15], #0x8\n" + "ldr d14, [x25], #0x8\n" + "mov x19, #0x28\n" + "ldr d18, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "ldr d26, [x22], #0x8\n" + "tbz x8, #0, 143f\n" + "ld1 { v10.s }[2], [x15]\n" + "ld1 { v14.s }[2], [x25]\n" + "ld1 { v18.s }[2], [x24]\n" + "ld1 { v22.s }[2], [x23]\n" + "ld1 { v26.s }[2], [x22]\n" + "b 143f\n" + "138:" // Height 5: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x8, #0, 143f\n" + "ldr s10, [x15, #0x0]\n" + "ldr s14, [x25, #0x0]\n" + "ldr s18, [x24, #0x0]\n" + "ldr s22, [x23, #0x0]\n" + "ldr s26, [x22, #0x0]\n" + "b 143f\n" + "139:" // Height 5: Partial accumulate: partial_4_0 + "tbz x8, #2, 141f\n" + "ld1 { v8.4s }, [x15], #0x10\n" + "ld1 { v12.4s }, [x25], #0x10\n" + "ld1 { v16.4s }, [x24], #0x10\n" + "ld1 { v20.4s }, [x23], #0x10\n" + "ld1 { v24.4s }, [x22], #0x10\n" + "tbz x8, #1, 140f\n" + "ldr d9, [x15], #0x8\n" + "mov x19, #0x18\n" + "ldr d13, [x25], #0x8\n" + "ldr d17, [x24], #0x8\n" + "ldr d21, [x23], #0x8\n" + "ldr d25, [x22], #0x8\n" + "tbz x8, #0, 143f\n" + "ld1 { v9.s }[2], [x15]\n" + "ld1 { v13.s }[2], [x25]\n" + "ld1 { v17.s }[2], [x24]\n" + "ld1 { v21.s }[2], [x23]\n" + "ld1 { v25.s }[2], [x22]\n" + "b 143f\n" + "140:" // Height 5: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x8, #0, 143f\n" + "ldr s9, [x15, #0x0]\n" + "ldr s13, [x25, #0x0]\n" + "ldr s17, [x24, #0x0]\n" + "ldr s21, [x23, #0x0]\n" + "ldr s25, [x22, #0x0]\n" + "b 143f\n" + "141:" // Height 5: Partial accumulate: partial_2_0 + "tbz x8, #1, 142f\n" + "ldr d8, [x15], #0x8\n" + "ldr d12, [x25], #0x8\n" + "mov x19, #0x8\n" + "ldr d16, [x24], #0x8\n" + "ldr d20, [x23], #0x8\n" + "ldr d24, [x22], #0x8\n" + "tbz x8, #0, 143f\n" + "ld1 { v8.s }[2], [x15]\n" + "ld1 { v12.s }[2], [x25]\n" + "ld1 { v16.s }[2], [x24]\n" + "ld1 { v20.s }[2], [x23]\n" + "ld1 { v24.s }[2], [x22]\n" + "b 143f\n" + "142:" // Height 5: Partial accumulate: 
partial_1_0 + "ldr s8, [x15, #0x0]\n" + "mov x19, #0x0\n" + "ldr s12, [x25, #0x0]\n" + "ldr s16, [x24, #0x0]\n" + "ldr s20, [x23, #0x0]\n" + "ldr s24, [x22, #0x0]\n" + "143:" // Height 5: Partial accumulate: Done + "sub x15, x15, x19\n" + "b 146f\n" + "144:" // Height 5: full accumulate + "ldr q8, [x15, #0x0]\n" + "ldr q9, [x15, #0x10]\n" + "ldr q10, [x15, #0x20]\n" + "ldr q11, [x15, #0x30]\n" + "ldr q12, [x25, #0x0]\n" + "ldr q13, [x25, #0x10]\n" + "ldr q14, [x25, #0x20]\n" + "ldr q15, [x25, #0x30]\n" + "ldr q16, [x24, #0x0]\n" + "ldr q17, [x24, #0x10]\n" + "ldr q18, [x24, #0x20]\n" + "ldr q19, [x24, #0x30]\n" + "ldr q20, [x23, #0x0]\n" + "ldr q21, [x23, #0x10]\n" + "ldr q22, [x23, #0x20]\n" + "ldr q23, [x23, #0x30]\n" + "ldr q24, [x22, #0x0]\n" + "ldr q25, [x22, #0x10]\n" + "ldr q26, [x22, #0x20]\n" + "ldr q27, [x22, #0x30]\n" + "b 146f\n" + "145:" // Height 5: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "146:" // Height 5: setup done + "mov x14, #0x0\n" + "147:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w13, [x20, x14, LSL #0x2]\n" + "tbz %x[flags], #3, 148f\n" + "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x12, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x14, 149f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x12, x12, x19, LSL #2\n" + "add x28, 
x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "b 149f\n" + "148:" // Height 5: setup direct input + "mov x12, %x[input_ptr]\n" + "add x28, x12, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "add x24, x26, x19, LSL #2\n" + "add x22, x24, x19, LSL #2\n" + "149:" // Height 5: input setup done + "cmp x13, #0x4\n" + "blt 152f\n" + "ldr q0, [x12, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "cmp x13, #0x8\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x17, #0x0]\n" + "blt 151f\n" + "150:" // Height 5: Multiply loop: Main loop head + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr d7, [x17, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "ldr x11, [x17, #0x18]\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "ldr x10, [x17, #0x28]\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "add x12, x12, #0x10\n" + "fmla v24.4s, v6.4s, v4.s[0]\n" + "mov v7.d[1], x11\n" + "prfm pldl1keep, [x12, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "ldr d6, [x17, #0x20]\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "ldr x11, [x17, #0x38]\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "ldr x9, [x12, #0x8]\n" + "fmla v25.4s, v7.4s, v4.s[0]\n" + "mov v6.d[1], x10\n" + "ldr d7, [x17, #0x30]\n" + "add x26, x26, #0x10\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "ldr x10, [x17, #0x48]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "mov v7.d[1], x11\n" + "fmla v22.4s, v6.4s, v3.s[0]\n" + "ldr x11, [x17, #0x58]\n" + "fmla v26.4s, v6.4s, v4.s[0]\n" + "ldr d6, [x17, #0x40]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "ldr x27, [x28, #0x8]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "ldr x25, [x26, #0x8]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "mov v6.d[1], x10\n" + "fmla v23.4s, v7.4s, v3.s[0]\n" + "ldr x10, [x17, #0x68]\n" + "fmla v27.4s, v7.4s, v4.s[0]\n" + "ldr d7, [x17, #0x50]\n" + "fmla v8.4s, 
v6.4s, v0.s[1]\n" + "add x24, x24, #0x10\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v16.4s, v6.4s, v2.s[1]\n" + "mov v7.d[1], x11\n" + "fmla v20.4s, v6.4s, v3.s[1]\n" + "ldr x11, [x17, #0x78]\n" + "fmla v24.4s, v6.4s, v4.s[1]\n" + "ldr d6, [x17, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "ldr x23, [x24, #0x8]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "add x22, x22, #0x10\n" + "fmla v17.4s, v7.4s, v2.s[1]\n" + "mov v6.d[1], x10\n" + "fmla v21.4s, v7.4s, v3.s[1]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v25.4s, v7.4s, v4.s[1]\n" + "ldr d7, [x17, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "ldr x10, [x17, #0x88]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "ldr x21, [x22, #0x8]\n" + "fmla v18.4s, v6.4s, v2.s[1]\n" + "mov v7.d[1], x11\n" + "fmla v22.4s, v6.4s, v3.s[1]\n" + "ldr x11, [x17, #0x98]\n" + "fmla v26.4s, v6.4s, v4.s[1]\n" + "ldr d6, [x17, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "sub x13, x13, #0x4\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "cmp x13, #0x8\n" + "fmla v19.4s, v7.4s, v2.s[1]\n" + "mov v6.d[1], x10\n" + "fmla v23.4s, v7.4s, v3.s[1]\n" + "ldr x10, [x17, #0xa8]\n" + "fmla v27.4s, v7.4s, v4.s[1]\n" + "ldr d7, [x17, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "fmla v16.4s, v6.4s, v2.s[2]\n" + "mov v7.d[1], x11\n" + "fmla v20.4s, v6.4s, v3.s[2]\n" + "ldr x11, [x17, #0xb8]\n" + "fmla v24.4s, v6.4s, v4.s[2]\n" + "ldr d6, [x17, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "fmla v17.4s, v7.4s, v2.s[2]\n" + "mov v6.d[1], x10\n" + "fmla v21.4s, v7.4s, v3.s[2]\n" + "ldr x10, [x17, #0xc8]\n" + "fmla v25.4s, v7.4s, v4.s[2]\n" + "ldr d7, [x17, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "fmla v18.4s, v6.4s, v2.s[2]\n" + "mov v7.d[1], x11\n" + "fmla v22.4s, v6.4s, v3.s[2]\n" + "ldr x11, [x17, #0xd8]\n" + "fmla v26.4s, v6.4s, v4.s[2]\n" + "ldr d6, [x17, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v15.4s, v7.4s, 
v1.s[2]\n" + "fmla v19.4s, v7.4s, v2.s[2]\n" + "mov v6.d[1], x10\n" + "fmla v23.4s, v7.4s, v3.s[2]\n" + "ldr x10, [x17, #0xe8]\n" + "fmla v27.4s, v7.4s, v4.s[2]\n" + "ldr d7, [x17, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "fmla v16.4s, v6.4s, v2.s[3]\n" + "mov v7.d[1], x11\n" + "fmla v20.4s, v6.4s, v3.s[3]\n" + "ldr x11, [x17, #0xf8]\n" + "fmla v24.4s, v6.4s, v4.s[3]\n" + "ldr d6, [x17, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "fmla v17.4s, v7.4s, v2.s[3]\n" + "mov v6.d[1], x10\n" + "fmla v21.4s, v7.4s, v3.s[3]\n" + "fmla v25.4s, v7.4s, v4.s[3]\n" + "ldr d7, [x17, #0xf0]\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "add x17, x17, #0x100\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "ldr x10, [x17, #0x8]\n" + "fmla v18.4s, v6.4s, v2.s[3]\n" + "mov v7.d[1], x11\n" + "fmla v22.4s, v6.4s, v3.s[3]\n" + "fmla v26.4s, v6.4s, v4.s[3]\n" + "ldr d6, [x17, #0x0]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "ldr d0, [x12, #0x0]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "ldr d1, [x28, #0x0]\n" + "fmla v19.4s, v7.4s, v2.s[3]\n" + "mov v6.d[1], x10\n" + "fmla v23.4s, v7.4s, v3.s[3]\n" + "mov v0.d[1], x9\n" + "fmla v27.4s, v7.4s, v4.s[3]\n" + "mov v1.d[1], x27\n" + "ldr d2, [x26, #0x0]\n" + "ldr d3, [x24, #0x0]\n" + "ldr d4, [x22, #0x0]\n" + "mov v2.d[1], x25\n" + "mov v3.d[1], x23\n" + "mov v4.d[1], x21\n" + "bge 150b\n" + "151:" // Height 5: Multiply loop: Single iteration only + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x17, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "sub x13, x13, #0x4\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "add x12, x12, #0x10\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v24.4s, v6.4s, v4.s[0]\n" + "ldr q6, [x17, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "add x28, x28, #0x10\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "add x26, x26, #0x10\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "prfm pldl1keep, [x26, 
#0x80]\n" + "fmla v25.4s, v7.4s, v4.s[0]\n" + "ldr q7, [x17, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "add x24, x24, #0x10\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "add x22, x22, #0x10\n" + "fmla v22.4s, v6.4s, v3.s[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v26.4s, v6.4s, v4.s[0]\n" + "ldr q6, [x17, #0x40]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "fmla v23.4s, v7.4s, v3.s[0]\n" + "fmla v27.4s, v7.4s, v4.s[0]\n" + "ldr q7, [x17, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "fmla v16.4s, v6.4s, v2.s[1]\n" + "fmla v20.4s, v6.4s, v3.s[1]\n" + "fmla v24.4s, v6.4s, v4.s[1]\n" + "ldr q6, [x17, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "fmla v17.4s, v7.4s, v2.s[1]\n" + "fmla v21.4s, v7.4s, v3.s[1]\n" + "fmla v25.4s, v7.4s, v4.s[1]\n" + "ldr q7, [x17, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "fmla v18.4s, v6.4s, v2.s[1]\n" + "fmla v22.4s, v6.4s, v3.s[1]\n" + "fmla v26.4s, v6.4s, v4.s[1]\n" + "ldr q6, [x17, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "fmla v19.4s, v7.4s, v2.s[1]\n" + "fmla v23.4s, v7.4s, v3.s[1]\n" + "fmla v27.4s, v7.4s, v4.s[1]\n" + "ldr q7, [x17, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "fmla v16.4s, v6.4s, v2.s[2]\n" + "fmla v20.4s, v6.4s, v3.s[2]\n" + "fmla v24.4s, v6.4s, v4.s[2]\n" + "ldr q6, [x17, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "fmla v17.4s, v7.4s, v2.s[2]\n" + "fmla v21.4s, v7.4s, v3.s[2]\n" + "fmla v25.4s, v7.4s, v4.s[2]\n" + "ldr q7, [x17, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "fmla v18.4s, v6.4s, v2.s[2]\n" + "fmla v22.4s, v6.4s, v3.s[2]\n" + "fmla v26.4s, v6.4s, v4.s[2]\n" + "ldr q6, [x17, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + 
"fmla v15.4s, v7.4s, v1.s[2]\n" + "fmla v19.4s, v7.4s, v2.s[2]\n" + "fmla v23.4s, v7.4s, v3.s[2]\n" + "fmla v27.4s, v7.4s, v4.s[2]\n" + "ldr q7, [x17, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "fmla v16.4s, v6.4s, v2.s[3]\n" + "fmla v20.4s, v6.4s, v3.s[3]\n" + "fmla v24.4s, v6.4s, v4.s[3]\n" + "ldr q6, [x17, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "fmla v17.4s, v7.4s, v2.s[3]\n" + "fmla v21.4s, v7.4s, v3.s[3]\n" + "fmla v25.4s, v7.4s, v4.s[3]\n" + "ldr q7, [x17, #0xf0]\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "add x17, x17, #0x100\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v18.4s, v6.4s, v2.s[3]\n" + "fmla v22.4s, v6.4s, v3.s[3]\n" + "fmla v26.4s, v6.4s, v4.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v19.4s, v7.4s, v2.s[3]\n" + "fmla v23.4s, v7.4s, v3.s[3]\n" + "fmla v27.4s, v7.4s, v4.s[3]\n" + "152:" // Height 5: Multiply loop: Main loop skip + "cbz x13, 154f\n" + "153:" // Height 5: Multiply loop: Odd block loop + "ldr s0, [x12], #0x4\n" + "sub x13, x13, #0x1\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr q6, [x17, #0x0]\n" + "ldr q7, [x17, #0x10]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "fmla v24.4s, v6.4s, v4.s[0]\n" + "ldr q6, [x17, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "fmla v25.4s, v7.4s, v4.s[0]\n" + "ldr q7, [x17, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "add x17, x17, #0x40\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "fmla v22.4s, v6.4s, v3.s[0]\n" + "fmla v26.4s, v6.4s, v4.s[0]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "fmla v23.4s, v7.4s, v3.s[0]\n" + "fmla v27.4s, v7.4s, 
v4.s[0]\n" + "cbnz x13, 153b\n" + "154:" // Height 5: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x14, x14, #0x1\n" + "cmp x14, x19\n" + "bne 147b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x15, #0x0]\n" + "add x25, x15, x19, LSL #2\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add x24, x25, x19, LSL #2\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "tbz %x[flags], #1, 155f\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v1.4s }, [x20]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmin v20.4s, v20.4s, v0.4s\n" + "fmin v21.4s, v21.4s, v0.4s\n" + "fmin v22.4s, v22.4s, v0.4s\n" + "fmin v23.4s, v23.4s, v0.4s\n" + "fmin v24.4s, v24.4s, v0.4s\n" + "fmin v25.4s, v25.4s, v0.4s\n" + "fmin v26.4s, v26.4s, v0.4s\n" + "fmin v27.4s, v27.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "fmax v24.4s, v24.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "fmax v26.4s, v26.4s, 
v1.4s\n" + "fmax v27.4s, v27.4s, v1.4s\n" + "155:" // Height 5: No activation + "cmp x8, #0x10\n" + "bge 164f\n" + "tbz x8, #3, 159f\n" + "st1 { v8.4s }, [x15], #0x10\n" + "st1 { v9.4s }, [x15], #0x10\n" + "st1 { v12.4s }, [x25], #0x10\n" + "st1 { v13.4s }, [x25], #0x10\n" + "st1 { v16.4s }, [x24], #0x10\n" + "st1 { v17.4s }, [x24], #0x10\n" + "st1 { v20.4s }, [x23], #0x10\n" + "st1 { v21.4s }, [x23], #0x10\n" + "st1 { v24.4s }, [x22], #0x10\n" + "st1 { v25.4s }, [x22], #0x10\n" + "tbz x8, #2, 157f\n" + "st1 { v10.4s }, [x15], #0x10\n" + "st1 { v14.4s }, [x25], #0x10\n" + "st1 { v18.4s }, [x24], #0x10\n" + "st1 { v22.4s }, [x23], #0x10\n" + "st1 { v26.4s }, [x22], #0x10\n" + "tbz x8, #1, 156f\n" + "str d11, [x15], #0x8\n" + "str d15, [x25], #0x8\n" + "str d19, [x24], #0x8\n" + "str d23, [x23], #0x8\n" + "str d27, [x22], #0x8\n" + "tbz x8, #0, 163f\n" + "st1 { v11.s }[2], [x15]\n" + "st1 { v15.s }[2], [x25]\n" + "st1 { v19.s }[2], [x24]\n" + "st1 { v23.s }[2], [x23]\n" + "st1 { v27.s }[2], [x22]\n" + "b 163f\n" + "156:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x8, #0, 163f\n" + "str s11, [x15, #0x0]\n" + "str s15, [x25, #0x0]\n" + "str s19, [x24, #0x0]\n" + "str s23, [x23, #0x0]\n" + "str s27, [x22, #0x0]\n" + "b 163f\n" + "157:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x8, #1, 158f\n" + "str d10, [x15], #0x8\n" + "str d14, [x25], #0x8\n" + "str d18, [x24], #0x8\n" + "str d22, [x23], #0x8\n" + "str d26, [x22], #0x8\n" + "tbz x8, #0, 163f\n" + "st1 { v10.s }[2], [x15]\n" + "st1 { v14.s }[2], [x25]\n" + "st1 { v18.s }[2], [x24]\n" + "st1 { v22.s }[2], [x23]\n" + "st1 { v26.s }[2], [x22]\n" + "b 163f\n" + "158:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x8, #0, 163f\n" + "str s10, [x15, #0x0]\n" + "str s14, [x25, #0x0]\n" + "str s18, [x24, #0x0]\n" + "str s22, [x23, #0x0]\n" + "str s26, [x22, #0x0]\n" + "b 163f\n" + "159:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x8, #2, 161f\n" + "st1 { v8.4s }, 
[x15], #0x10\n" + "st1 { v12.4s }, [x25], #0x10\n" + "st1 { v16.4s }, [x24], #0x10\n" + "st1 { v20.4s }, [x23], #0x10\n" + "st1 { v24.4s }, [x22], #0x10\n" + "tbz x8, #1, 160f\n" + "str d9, [x15], #0x8\n" + "str d13, [x25], #0x8\n" + "str d17, [x24], #0x8\n" + "str d21, [x23], #0x8\n" + "str d25, [x22], #0x8\n" + "tbz x8, #0, 163f\n" + "st1 { v9.s }[2], [x15]\n" + "st1 { v13.s }[2], [x25]\n" + "st1 { v17.s }[2], [x24]\n" + "st1 { v21.s }[2], [x23]\n" + "st1 { v25.s }[2], [x22]\n" + "b 163f\n" + "160:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x8, #0, 163f\n" + "str s9, [x15, #0x0]\n" + "str s13, [x25, #0x0]\n" + "str s17, [x24, #0x0]\n" + "str s21, [x23, #0x0]\n" + "str s25, [x22, #0x0]\n" + "b 163f\n" + "161:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x8, #1, 162f\n" + "str d8, [x15], #0x8\n" + "str d12, [x25], #0x8\n" + "str d16, [x24], #0x8\n" + "str d20, [x23], #0x8\n" + "str d24, [x22], #0x8\n" + "tbz x8, #0, 163f\n" + "st1 { v8.s }[2], [x15]\n" + "st1 { v12.s }[2], [x25]\n" + "st1 { v16.s }[2], [x24]\n" + "st1 { v20.s }[2], [x23]\n" + "st1 { v24.s }[2], [x22]\n" + "b 163f\n" + "162:" // Height 5: Partial direct writeback: partial_1_0 + "str s8, [x15, #0x0]\n" + "str s12, [x25, #0x0]\n" + "str s16, [x24, #0x0]\n" + "str s20, [x23, #0x0]\n" + "str s24, [x22, #0x0]\n" + "163:" // Height 5: Partial direct writeback: Done + "b 165f\n" + "164:" // Height 5: Full writeback + "str q8, [x15, #0x0]\n" + "str q9, [x15, #0x10]\n" + "str q10, [x15, #0x20]\n" + "str q11, [x15, #0x30]\n" + "add x15, x15, #0x40\n" + "str q12, [x25, #0x0]\n" + "str q13, [x25, #0x10]\n" + "str q14, [x25, #0x20]\n" + "str q15, [x25, #0x30]\n" + "str q16, [x24, #0x0]\n" + "str q17, [x24, #0x10]\n" + "str q18, [x24, #0x20]\n" + "str q19, [x24, #0x30]\n" + "str q20, [x23, #0x0]\n" + "str q21, [x23, #0x10]\n" + "str q22, [x23, #0x20]\n" + "str q23, [x23, #0x30]\n" + "str q24, [x22, #0x0]\n" + "str q25, [x22, #0x10]\n" + "str q26, [x22, #0x20]\n" + "str q27, [x22, 
#0x30]\n" + "165:" // Height 5: Writeback done + "subs x8, x8, #0x10\n" + "bgt 134b\n" + "b 200f\n" + "166:" // Height 6 + "ldr x8, [%x[args_ptr], %[offsetof_N]]\n" + "mov x16, %x[bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x15, %x[output_ptr]\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x19, #0x18\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "167:" // Height 6: Column loop + "cbz x16, 168f\n" + "ldr q8, [x16, #0x0]\n" + "ldr q9, [x16, #0x10]\n" + "ldr q10, [x16, #0x20]\n" + "mov v12.16b, v8.16b\n" + "mov v16.16b, v8.16b\n" + "mov v13.16b, v9.16b\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v18.16b, v10.16b\n" + "mov v20.16b, v8.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "mov v24.16b, v8.16b\n" + "mov v25.16b, v9.16b\n" + "mov v26.16b, v10.16b\n" + "mov v28.16b, v8.16b\n" + "mov v29.16b, v9.16b\n" + "mov v30.16b, v10.16b\n" + "ldr q11, [x16, #0x30]\n" + "add x16, x16, #0x40\n" + "mov v15.16b, v11.16b\n" + "mov v19.16b, v11.16b\n" + "mov v23.16b, v11.16b\n" + "mov v27.16b, v11.16b\n" + "mov v31.16b, v11.16b\n" + "b 179f\n" + "168:" // Height 6: no bias + "tbz %x[flags], #0, 178f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x8, #0x10\n" + "add x25, x15, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "bge 177f\n" + "tbz x8, #3, 172f\n" + "ld1 { v8.4s }, [x15], #0x10\n" + "ld1 { v12.4s }, [x25], #0x10\n" + "ld1 { v16.4s }, [x24], #0x10\n" + "ld1 { v9.4s }, [x15], #0x10\n" + "ld1 { v13.4s }, [x25], #0x10\n" + "ld1 { v17.4s }, [x24], #0x10\n" + "ld1 { v20.4s }, [x23], #0x10\n" + "ld1 { v24.4s }, [x22], #0x10\n" + "ld1 { v28.4s }, [x21], #0x10\n" + "ld1 { v21.4s }, [x23], #0x10\n" + "ld1 { v25.4s }, [x22], #0x10\n" + "ld1 { v29.4s }, [x21], #0x10\n" + "tbz x8, #2, 170f\n" + "ld1 { v10.4s }, [x15], #0x10\n" + "ld1 { v14.4s }, [x25], #0x10\n" + "ld1 { v18.4s }, 
[x24], #0x10\n" + "ld1 { v22.4s }, [x23], #0x10\n" + "ld1 { v26.4s }, [x22], #0x10\n" + "ld1 { v30.4s }, [x21], #0x10\n" + "tbz x8, #1, 169f\n" + "ldr d11, [x15], #0x8\n" + "mov x19, #0x38\n" + "ldr d15, [x25], #0x8\n" + "ldr d19, [x24], #0x8\n" + "ldr d23, [x23], #0x8\n" + "ldr d27, [x22], #0x8\n" + "ldr d31, [x21], #0x8\n" + "tbz x8, #0, 176f\n" + "ld1 { v11.s }[2], [x15]\n" + "ld1 { v15.s }[2], [x25]\n" + "ld1 { v19.s }[2], [x24]\n" + "ld1 { v23.s }[2], [x23]\n" + "ld1 { v27.s }[2], [x22]\n" + "ld1 { v31.s }[2], [x21]\n" + "b 176f\n" + "169:" // Height 6: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x8, #0, 176f\n" + "ldr s11, [x15, #0x0]\n" + "ldr s15, [x25, #0x0]\n" + "ldr s19, [x24, #0x0]\n" + "ldr s23, [x23, #0x0]\n" + "ldr s27, [x22, #0x0]\n" + "ldr s31, [x21, #0x0]\n" + "b 176f\n" + "170:" // Height 6: Partial accumulate: partial_2_8 + "tbz x8, #1, 171f\n" + "ldr d10, [x15], #0x8\n" + "ldr d14, [x25], #0x8\n" + "mov x19, #0x28\n" + "ldr d18, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "ldr d26, [x22], #0x8\n" + "ldr d30, [x21], #0x8\n" + "tbz x8, #0, 176f\n" + "ld1 { v10.s }[2], [x15]\n" + "ld1 { v14.s }[2], [x25]\n" + "ld1 { v18.s }[2], [x24]\n" + "ld1 { v22.s }[2], [x23]\n" + "ld1 { v26.s }[2], [x22]\n" + "ld1 { v30.s }[2], [x21]\n" + "b 176f\n" + "171:" // Height 6: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x8, #0, 176f\n" + "ldr s10, [x15, #0x0]\n" + "ldr s14, [x25, #0x0]\n" + "ldr s18, [x24, #0x0]\n" + "ldr s22, [x23, #0x0]\n" + "ldr s26, [x22, #0x0]\n" + "ldr s30, [x21, #0x0]\n" + "b 176f\n" + "172:" // Height 6: Partial accumulate: partial_4_0 + "tbz x8, #2, 174f\n" + "ld1 { v8.4s }, [x15], #0x10\n" + "ld1 { v12.4s }, [x25], #0x10\n" + "ld1 { v16.4s }, [x24], #0x10\n" + "ld1 { v20.4s }, [x23], #0x10\n" + "ld1 { v24.4s }, [x22], #0x10\n" + "ld1 { v28.4s }, [x21], #0x10\n" + "tbz x8, #1, 173f\n" + "ldr d9, [x15], #0x8\n" + "mov x19, #0x18\n" + "ldr d13, [x25], #0x8\n" + "ldr d17, [x24], #0x8\n" + "ldr d21, [x23], 
#0x8\n" + "ldr d25, [x22], #0x8\n" + "ldr d29, [x21], #0x8\n" + "tbz x8, #0, 176f\n" + "ld1 { v9.s }[2], [x15]\n" + "ld1 { v13.s }[2], [x25]\n" + "ld1 { v17.s }[2], [x24]\n" + "ld1 { v21.s }[2], [x23]\n" + "ld1 { v25.s }[2], [x22]\n" + "ld1 { v29.s }[2], [x21]\n" + "b 176f\n" + "173:" // Height 6: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x8, #0, 176f\n" + "ldr s9, [x15, #0x0]\n" + "ldr s13, [x25, #0x0]\n" + "ldr s17, [x24, #0x0]\n" + "ldr s21, [x23, #0x0]\n" + "ldr s25, [x22, #0x0]\n" + "ldr s29, [x21, #0x0]\n" + "b 176f\n" + "174:" // Height 6: Partial accumulate: partial_2_0 + "tbz x8, #1, 175f\n" + "ldr d8, [x15], #0x8\n" + "ldr d12, [x25], #0x8\n" + "mov x19, #0x8\n" + "ldr d16, [x24], #0x8\n" + "ldr d20, [x23], #0x8\n" + "ldr d24, [x22], #0x8\n" + "ldr d28, [x21], #0x8\n" + "tbz x8, #0, 176f\n" + "ld1 { v8.s }[2], [x15]\n" + "ld1 { v12.s }[2], [x25]\n" + "ld1 { v16.s }[2], [x24]\n" + "ld1 { v20.s }[2], [x23]\n" + "ld1 { v24.s }[2], [x22]\n" + "ld1 { v28.s }[2], [x21]\n" + "b 176f\n" + "175:" // Height 6: Partial accumulate: partial_1_0 + "ldr s8, [x15, #0x0]\n" + "mov x19, #0x0\n" + "ldr s12, [x25, #0x0]\n" + "ldr s16, [x24, #0x0]\n" + "ldr s20, [x23, #0x0]\n" + "ldr s24, [x22, #0x0]\n" + "ldr s28, [x21, #0x0]\n" + "176:" // Height 6: Partial accumulate: Done + "sub x15, x15, x19\n" + "b 179f\n" + "177:" // Height 6: full accumulate + "ldr q8, [x15, #0x0]\n" + "ldr q9, [x15, #0x10]\n" + "ldr q10, [x15, #0x20]\n" + "ldr q11, [x15, #0x30]\n" + "ldr q12, [x25, #0x0]\n" + "ldr q13, [x25, #0x10]\n" + "ldr q14, [x25, #0x20]\n" + "ldr q15, [x25, #0x30]\n" + "ldr q16, [x24, #0x0]\n" + "ldr q17, [x24, #0x10]\n" + "ldr q18, [x24, #0x20]\n" + "ldr q19, [x24, #0x30]\n" + "ldr q20, [x23, #0x0]\n" + "ldr q21, [x23, #0x10]\n" + "ldr q22, [x23, #0x20]\n" + "ldr q23, [x23, #0x30]\n" + "ldr q24, [x22, #0x0]\n" + "ldr q25, [x22, #0x10]\n" + "ldr q26, [x22, #0x20]\n" + "ldr q27, [x22, #0x30]\n" + "ldr q28, [x21, #0x0]\n" + "ldr q29, [x21, #0x10]\n" + "ldr q30, 
[x21, #0x20]\n" + "ldr q31, [x21, #0x30]\n" + "b 179f\n" + "178:" // Height 6: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "179:" // Height 6: setup done + "mov x14, #0x0\n" + "180:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w13, [x20, x14, LSL #0x2]\n" + "tbz %x[flags], #3, 181f\n" + "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x12, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x14, 182f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x12, x12, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "add x20, x20, x19, LSL #2\n" + "b 182f\n" + "181:" // Height 6: setup direct input + "mov x12, %x[input_ptr]\n" + "add x28, x12, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "add x24, x26, x19, LSL #2\n" + "add x22, x24, x19, LSL #2\n" + "add x20, x22, x19, LSL #2\n" + "182:" // Height 6: input setup done + "cmp x13, #0x4\n" + "blt 185f\n" + "ldr q0, [x12, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "cmp x13, #0x8\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x17, #0x0]\n" + 
"blt 184f\n" + "183:" // Height 6: Multiply loop: Main loop head + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr d7, [x17, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "ldr x11, [x17, #0x18]\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "ldr x10, [x17, #0x28]\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "add x12, x12, #0x10\n" + "fmla v24.4s, v6.4s, v4.s[0]\n" + "mov v7.d[1], x11\n" + "fmla v28.4s, v6.4s, v5.s[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "ldr d6, [x17, #0x20]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "ldr x11, [x17, #0x38]\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "ldr x9, [x12, #0x8]\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "mov v6.d[1], x10\n" + "fmla v25.4s, v7.4s, v4.s[0]\n" + "ldr x10, [x17, #0x48]\n" + "fmla v29.4s, v7.4s, v5.s[0]\n" + "ldr d7, [x17, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "add x28, x28, #0x10\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "mov v7.d[1], x11\n" + "fmla v22.4s, v6.4s, v3.s[0]\n" + "ldr x11, [x17, #0x58]\n" + "fmla v26.4s, v6.4s, v4.s[0]\n" + "ldr x27, [x28, #0x8]\n" + "fmla v30.4s, v6.4s, v5.s[0]\n" + "ldr d6, [x17, #0x40]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "add x26, x26, #0x10\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "mov v6.d[1], x10\n" + "fmla v23.4s, v7.4s, v3.s[0]\n" + "ldr x10, [x17, #0x68]\n" + "fmla v27.4s, v7.4s, v4.s[0]\n" + "ldr x25, [x26, #0x8]\n" + "fmla v31.4s, v7.4s, v5.s[0]\n" + "ldr d7, [x17, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "add x24, x24, #0x10\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v16.4s, v6.4s, v2.s[1]\n" + "mov v7.d[1], x11\n" + "fmla v20.4s, v6.4s, v3.s[1]\n" + "ldr x11, [x17, #0x78]\n" + "fmla v24.4s, v6.4s, v4.s[1]\n" + "ldr x23, [x24, #0x8]\n" + "fmla v28.4s, v6.4s, v5.s[1]\n" + "ldr d6, [x17, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "add x22, x22, #0x10\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" 
+ "prfm pldl1keep, [x22, #0x80]\n" + "fmla v17.4s, v7.4s, v2.s[1]\n" + "mov v6.d[1], x10\n" + "fmla v21.4s, v7.4s, v3.s[1]\n" + "ldr x10, [x17, #0x88]\n" + "fmla v25.4s, v7.4s, v4.s[1]\n" + "ldr x21, [x22, #0x8]\n" + "fmla v29.4s, v7.4s, v5.s[1]\n" + "ldr d7, [x17, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "add x20, x20, #0x10\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "fmla v18.4s, v6.4s, v2.s[1]\n" + "mov v7.d[1], x11\n" + "fmla v22.4s, v6.4s, v3.s[1]\n" + "ldr x11, [x17, #0x98]\n" + "fmla v26.4s, v6.4s, v4.s[1]\n" + "ldr x19, [x20, #0x8]\n" + "fmla v30.4s, v6.4s, v5.s[1]\n" + "ldr d6, [x17, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "sub x13, x13, #0x4\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "cmp x13, #0x8\n" + "fmla v19.4s, v7.4s, v2.s[1]\n" + "mov v6.d[1], x10\n" + "fmla v23.4s, v7.4s, v3.s[1]\n" + "ldr x10, [x17, #0xa8]\n" + "fmla v27.4s, v7.4s, v4.s[1]\n" + "fmla v31.4s, v7.4s, v5.s[1]\n" + "ldr d7, [x17, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "fmla v16.4s, v6.4s, v2.s[2]\n" + "mov v7.d[1], x11\n" + "fmla v20.4s, v6.4s, v3.s[2]\n" + "ldr x11, [x17, #0xb8]\n" + "fmla v24.4s, v6.4s, v4.s[2]\n" + "fmla v28.4s, v6.4s, v5.s[2]\n" + "ldr d6, [x17, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "fmla v17.4s, v7.4s, v2.s[2]\n" + "mov v6.d[1], x10\n" + "fmla v21.4s, v7.4s, v3.s[2]\n" + "ldr x10, [x17, #0xc8]\n" + "fmla v25.4s, v7.4s, v4.s[2]\n" + "fmla v29.4s, v7.4s, v5.s[2]\n" + "ldr d7, [x17, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "fmla v18.4s, v6.4s, v2.s[2]\n" + "mov v7.d[1], x11\n" + "fmla v22.4s, v6.4s, v3.s[2]\n" + "ldr x11, [x17, #0xd8]\n" + "fmla v26.4s, v6.4s, v4.s[2]\n" + "fmla v30.4s, v6.4s, v5.s[2]\n" + "ldr d6, [x17, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v15.4s, v7.4s, v1.s[2]\n" + "fmla v19.4s, v7.4s, v2.s[2]\n" + "mov v6.d[1], x10\n" + "fmla v23.4s, v7.4s, v3.s[2]\n" + "ldr x10, [x17, 
#0xe8]\n" + "fmla v27.4s, v7.4s, v4.s[2]\n" + "fmla v31.4s, v7.4s, v5.s[2]\n" + "ldr d7, [x17, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "fmla v16.4s, v6.4s, v2.s[3]\n" + "mov v7.d[1], x11\n" + "fmla v20.4s, v6.4s, v3.s[3]\n" + "ldr x11, [x17, #0xf8]\n" + "fmla v24.4s, v6.4s, v4.s[3]\n" + "fmla v28.4s, v6.4s, v5.s[3]\n" + "ldr d6, [x17, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "fmla v17.4s, v7.4s, v2.s[3]\n" + "mov v6.d[1], x10\n" + "fmla v21.4s, v7.4s, v3.s[3]\n" + "fmla v25.4s, v7.4s, v4.s[3]\n" + "fmla v29.4s, v7.4s, v5.s[3]\n" + "ldr d7, [x17, #0xf0]\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "add x17, x17, #0x100\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "ldr x10, [x17, #0x8]\n" + "fmla v18.4s, v6.4s, v2.s[3]\n" + "mov v7.d[1], x11\n" + "fmla v22.4s, v6.4s, v3.s[3]\n" + "fmla v26.4s, v6.4s, v4.s[3]\n" + "fmla v30.4s, v6.4s, v5.s[3]\n" + "ldr d6, [x17, #0x0]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "ldr d0, [x12, #0x0]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "ldr d1, [x28, #0x0]\n" + "fmla v19.4s, v7.4s, v2.s[3]\n" + "mov v6.d[1], x10\n" + "fmla v23.4s, v7.4s, v3.s[3]\n" + "mov v0.d[1], x9\n" + "fmla v27.4s, v7.4s, v4.s[3]\n" + "mov v1.d[1], x27\n" + "fmla v31.4s, v7.4s, v5.s[3]\n" + "ldr d2, [x26, #0x0]\n" + "ldr d3, [x24, #0x0]\n" + "ldr d4, [x22, #0x0]\n" + "mov v2.d[1], x25\n" + "ldr d5, [x20, #0x0]\n" + "mov v3.d[1], x23\n" + "mov v4.d[1], x21\n" + "mov v5.d[1], x19\n" + "bge 183b\n" + "184:" // Height 6: Multiply loop: Single iteration only + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x17, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "sub x13, x13, #0x4\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "add x12, x12, #0x10\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v24.4s, v6.4s, v4.s[0]\n" + "add x28, x28, #0x10\n" + "fmla v28.4s, v6.4s, v5.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "ldr q6, [x17, #0x20]\n" + "fmla v13.4s, v7.4s, 
v1.s[0]\n" + "add x26, x26, #0x10\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "add x24, x24, #0x10\n" + "fmla v25.4s, v7.4s, v4.s[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v29.4s, v7.4s, v5.s[0]\n" + "ldr q7, [x17, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "add x22, x22, #0x10\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "add x20, x20, #0x10\n" + "fmla v22.4s, v6.4s, v3.s[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "fmla v26.4s, v6.4s, v4.s[0]\n" + "fmla v30.4s, v6.4s, v5.s[0]\n" + "ldr q6, [x17, #0x40]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "fmla v23.4s, v7.4s, v3.s[0]\n" + "fmla v27.4s, v7.4s, v4.s[0]\n" + "fmla v31.4s, v7.4s, v5.s[0]\n" + "ldr q7, [x17, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "fmla v16.4s, v6.4s, v2.s[1]\n" + "fmla v20.4s, v6.4s, v3.s[1]\n" + "fmla v24.4s, v6.4s, v4.s[1]\n" + "fmla v28.4s, v6.4s, v5.s[1]\n" + "ldr q6, [x17, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "fmla v17.4s, v7.4s, v2.s[1]\n" + "fmla v21.4s, v7.4s, v3.s[1]\n" + "fmla v25.4s, v7.4s, v4.s[1]\n" + "fmla v29.4s, v7.4s, v5.s[1]\n" + "ldr q7, [x17, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "fmla v18.4s, v6.4s, v2.s[1]\n" + "fmla v22.4s, v6.4s, v3.s[1]\n" + "fmla v26.4s, v6.4s, v4.s[1]\n" + "fmla v30.4s, v6.4s, v5.s[1]\n" + "ldr q6, [x17, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "fmla v19.4s, v7.4s, v2.s[1]\n" + "fmla v23.4s, v7.4s, v3.s[1]\n" + "fmla v27.4s, v7.4s, v4.s[1]\n" + "fmla v31.4s, v7.4s, v5.s[1]\n" + "ldr q7, [x17, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "fmla v16.4s, v6.4s, v2.s[2]\n" + "fmla v20.4s, v6.4s, v3.s[2]\n" + "fmla v24.4s, v6.4s, v4.s[2]\n" + "fmla v28.4s, v6.4s, 
v5.s[2]\n" + "ldr q6, [x17, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "fmla v17.4s, v7.4s, v2.s[2]\n" + "fmla v21.4s, v7.4s, v3.s[2]\n" + "fmla v25.4s, v7.4s, v4.s[2]\n" + "fmla v29.4s, v7.4s, v5.s[2]\n" + "ldr q7, [x17, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "fmla v18.4s, v6.4s, v2.s[2]\n" + "fmla v22.4s, v6.4s, v3.s[2]\n" + "fmla v26.4s, v6.4s, v4.s[2]\n" + "fmla v30.4s, v6.4s, v5.s[2]\n" + "ldr q6, [x17, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v15.4s, v7.4s, v1.s[2]\n" + "fmla v19.4s, v7.4s, v2.s[2]\n" + "fmla v23.4s, v7.4s, v3.s[2]\n" + "fmla v27.4s, v7.4s, v4.s[2]\n" + "fmla v31.4s, v7.4s, v5.s[2]\n" + "ldr q7, [x17, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "fmla v16.4s, v6.4s, v2.s[3]\n" + "fmla v20.4s, v6.4s, v3.s[3]\n" + "fmla v24.4s, v6.4s, v4.s[3]\n" + "fmla v28.4s, v6.4s, v5.s[3]\n" + "ldr q6, [x17, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "fmla v17.4s, v7.4s, v2.s[3]\n" + "fmla v21.4s, v7.4s, v3.s[3]\n" + "fmla v25.4s, v7.4s, v4.s[3]\n" + "fmla v29.4s, v7.4s, v5.s[3]\n" + "ldr q7, [x17, #0xf0]\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "add x17, x17, #0x100\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v18.4s, v6.4s, v2.s[3]\n" + "fmla v22.4s, v6.4s, v3.s[3]\n" + "fmla v26.4s, v6.4s, v4.s[3]\n" + "fmla v30.4s, v6.4s, v5.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v19.4s, v7.4s, v2.s[3]\n" + "fmla v23.4s, v7.4s, v3.s[3]\n" + "fmla v27.4s, v7.4s, v4.s[3]\n" + "fmla v31.4s, v7.4s, v5.s[3]\n" + "185:" // Height 6: Multiply loop: Main loop skip + "cbz x13, 187f\n" + "186:" // Height 6: Multiply loop: Odd block loop + "ldr s0, [x12], #0x4\n" + "sub x13, x13, #0x1\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr s5, [x20], #0x4\n" + "ldr q6, [x17, #0x0]\n" + "ldr q7, [x17, #0x10]\n" + "fmla v8.4s, 
v6.4s, v0.s[0]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "fmla v24.4s, v6.4s, v4.s[0]\n" + "fmla v28.4s, v6.4s, v5.s[0]\n" + "ldr q6, [x17, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "fmla v25.4s, v7.4s, v4.s[0]\n" + "fmla v29.4s, v7.4s, v5.s[0]\n" + "ldr q7, [x17, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "add x17, x17, #0x40\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "fmla v22.4s, v6.4s, v3.s[0]\n" + "fmla v26.4s, v6.4s, v4.s[0]\n" + "fmla v30.4s, v6.4s, v5.s[0]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "fmla v23.4s, v7.4s, v3.s[0]\n" + "fmla v27.4s, v7.4s, v4.s[0]\n" + "fmla v31.4s, v7.4s, v5.s[0]\n" + "cbnz x13, 186b\n" + "187:" // Height 6: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x14, x14, #0x1\n" + "cmp x14, x19\n" + "bne 180b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x15, #0x0]\n" + "add x25, x15, x19, LSL #2\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add x24, x25, x19, LSL #2\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbz %x[flags], #1, 188f\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v1.4s }, [x20]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax 
v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmin v20.4s, v20.4s, v0.4s\n" + "fmin v21.4s, v21.4s, v0.4s\n" + "fmin v22.4s, v22.4s, v0.4s\n" + "fmin v23.4s, v23.4s, v0.4s\n" + "fmin v24.4s, v24.4s, v0.4s\n" + "fmin v25.4s, v25.4s, v0.4s\n" + "fmin v26.4s, v26.4s, v0.4s\n" + "fmin v27.4s, v27.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "fmax v24.4s, v24.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "fmax v26.4s, v26.4s, v1.4s\n" + "fmax v27.4s, v27.4s, v1.4s\n" + "fmin v28.4s, v28.4s, v0.4s\n" + "fmin v29.4s, v29.4s, v0.4s\n" + "fmin v30.4s, v30.4s, v0.4s\n" + "fmin v31.4s, v31.4s, v0.4s\n" + "fmax v28.4s, v28.4s, v1.4s\n" + "fmax v29.4s, v29.4s, v1.4s\n" + "fmax v30.4s, v30.4s, v1.4s\n" + "fmax v31.4s, v31.4s, v1.4s\n" + "188:" // Height 6: No activation + "cmp x8, #0x10\n" + "bge 197f\n" + "tbz x8, #3, 192f\n" + "st1 { v8.4s }, [x15], #0x10\n" + "st1 { v9.4s }, [x15], #0x10\n" + "st1 { v12.4s }, [x25], #0x10\n" + "st1 { v13.4s }, [x25], #0x10\n" + "st1 { v16.4s }, [x24], #0x10\n" + "st1 { v17.4s }, [x24], #0x10\n" + "st1 { v20.4s }, [x23], #0x10\n" + "st1 { v21.4s }, [x23], #0x10\n" + "st1 { v24.4s }, [x22], #0x10\n" + "st1 { v25.4s }, [x22], #0x10\n" + "st1 { v28.4s }, [x21], #0x10\n" + "st1 { v29.4s }, [x21], #0x10\n" + "tbz x8, #2, 190f\n" + "st1 { v10.4s }, [x15], #0x10\n" + "st1 { v14.4s }, [x25], #0x10\n" + "st1 { v18.4s }, [x24], #0x10\n" + "st1 { v22.4s }, [x23], #0x10\n" + "st1 { v26.4s }, [x22], #0x10\n" + "st1 { v30.4s }, [x21], #0x10\n" + "tbz x8, 
#1, 189f\n" + "str d11, [x15], #0x8\n" + "str d15, [x25], #0x8\n" + "str d19, [x24], #0x8\n" + "str d23, [x23], #0x8\n" + "str d27, [x22], #0x8\n" + "str d31, [x21], #0x8\n" + "tbz x8, #0, 196f\n" + "st1 { v11.s }[2], [x15]\n" + "st1 { v15.s }[2], [x25]\n" + "st1 { v19.s }[2], [x24]\n" + "st1 { v23.s }[2], [x23]\n" + "st1 { v27.s }[2], [x22]\n" + "st1 { v31.s }[2], [x21]\n" + "b 196f\n" + "189:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x8, #0, 196f\n" + "str s11, [x15, #0x0]\n" + "str s15, [x25, #0x0]\n" + "str s19, [x24, #0x0]\n" + "str s23, [x23, #0x0]\n" + "str s27, [x22, #0x0]\n" + "str s31, [x21, #0x0]\n" + "b 196f\n" + "190:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x8, #1, 191f\n" + "str d10, [x15], #0x8\n" + "str d14, [x25], #0x8\n" + "str d18, [x24], #0x8\n" + "str d22, [x23], #0x8\n" + "str d26, [x22], #0x8\n" + "str d30, [x21], #0x8\n" + "tbz x8, #0, 196f\n" + "st1 { v10.s }[2], [x15]\n" + "st1 { v14.s }[2], [x25]\n" + "st1 { v18.s }[2], [x24]\n" + "st1 { v22.s }[2], [x23]\n" + "st1 { v26.s }[2], [x22]\n" + "st1 { v30.s }[2], [x21]\n" + "b 196f\n" + "191:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x8, #0, 196f\n" + "str s10, [x15, #0x0]\n" + "str s14, [x25, #0x0]\n" + "str s18, [x24, #0x0]\n" + "str s22, [x23, #0x0]\n" + "str s26, [x22, #0x0]\n" + "str s30, [x21, #0x0]\n" + "b 196f\n" + "192:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x8, #2, 194f\n" + "st1 { v8.4s }, [x15], #0x10\n" + "st1 { v12.4s }, [x25], #0x10\n" + "st1 { v16.4s }, [x24], #0x10\n" + "st1 { v20.4s }, [x23], #0x10\n" + "st1 { v24.4s }, [x22], #0x10\n" + "st1 { v28.4s }, [x21], #0x10\n" + "tbz x8, #1, 193f\n" + "str d9, [x15], #0x8\n" + "str d13, [x25], #0x8\n" + "str d17, [x24], #0x8\n" + "str d21, [x23], #0x8\n" + "str d25, [x22], #0x8\n" + "str d29, [x21], #0x8\n" + "tbz x8, #0, 196f\n" + "st1 { v9.s }[2], [x15]\n" + "st1 { v13.s }[2], [x25]\n" + "st1 { v17.s }[2], [x24]\n" + "st1 { v21.s }[2], [x23]\n" + "st1 { 
v25.s }[2], [x22]\n" + "st1 { v29.s }[2], [x21]\n" + "b 196f\n" + "193:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x8, #0, 196f\n" + "str s9, [x15, #0x0]\n" + "str s13, [x25, #0x0]\n" + "str s17, [x24, #0x0]\n" + "str s21, [x23, #0x0]\n" + "str s25, [x22, #0x0]\n" + "str s29, [x21, #0x0]\n" + "b 196f\n" + "194:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x8, #1, 195f\n" + "str d8, [x15], #0x8\n" + "str d12, [x25], #0x8\n" + "str d16, [x24], #0x8\n" + "str d20, [x23], #0x8\n" + "str d24, [x22], #0x8\n" + "str d28, [x21], #0x8\n" + "tbz x8, #0, 196f\n" + "st1 { v8.s }[2], [x15]\n" + "st1 { v12.s }[2], [x25]\n" + "st1 { v16.s }[2], [x24]\n" + "st1 { v20.s }[2], [x23]\n" + "st1 { v24.s }[2], [x22]\n" + "st1 { v28.s }[2], [x21]\n" + "b 196f\n" + "195:" // Height 6: Partial direct writeback: partial_1_0 + "str s8, [x15, #0x0]\n" + "str s12, [x25, #0x0]\n" + "str s16, [x24, #0x0]\n" + "str s20, [x23, #0x0]\n" + "str s24, [x22, #0x0]\n" + "str s28, [x21, #0x0]\n" + "196:" // Height 6: Partial direct writeback: Done + "b 198f\n" + "197:" // Height 6: Full writeback + "str q8, [x15, #0x0]\n" + "str q9, [x15, #0x10]\n" + "str q10, [x15, #0x20]\n" + "str q11, [x15, #0x30]\n" + "add x15, x15, #0x40\n" + "str q12, [x25, #0x0]\n" + "str q13, [x25, #0x10]\n" + "str q14, [x25, #0x20]\n" + "str q15, [x25, #0x30]\n" + "str q16, [x24, #0x0]\n" + "str q17, [x24, #0x10]\n" + "str q18, [x24, #0x20]\n" + "str q19, [x24, #0x30]\n" + "str q20, [x23, #0x0]\n" + "str q21, [x23, #0x10]\n" + "str q22, [x23, #0x20]\n" + "str q23, [x23, #0x30]\n" + "str q24, [x22, #0x0]\n" + "str q25, [x22, #0x10]\n" + "str q26, [x22, #0x20]\n" + "str q27, [x22, #0x30]\n" + "str q28, [x21, #0x0]\n" + "str q29, [x21, #0x10]\n" + "str q30, [x21, #0x20]\n" + "str q31, [x21, #0x30]\n" + "198:" // Height 6: Writeback done + "subs x8, x8, #0x10\n" + "bgt 167b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 200f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 
199f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "199:" // Update direct input + "mov x19, #0x18\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "200:" // Exit + + : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp index a645954edd..f5504b44d4 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp @@ -95,222 +95,216 @@ void a64_hybrid_fp32_mla_6x16 ( "1:" // Row loop "cmp %x[M], #0x6\n" - "bge 171f\n" + "bge 166f\n" "cmp %x[M], #0x4\n" - "bgt 137f\n" - "beq 103f\n" + "bgt 133f\n" + "beq 100f\n" "cmp %x[M], #0x2\n" - "bgt 69f\n" - "beq 35f\n" - "ldr x16, [%x[args_ptr], 
%[offsetof_N]]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "mov x14, %x[bias]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 2f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "b 3f\n" - "2:" // Height 1: setup direct output - "mov x13, %x[output_ptr]\n" - "3:" // Height 1: Column loop - "cbz x14, 4f\n" - "ldr q8, [x14, #0x0]\n" - "ldr q9, [x14, #0x10]\n" - "ldr q10, [x14, #0x20]\n" - "ldr q11, [x14, #0x30]\n" - "add x14, x14, #0x40\n" - "b 15f\n" - "4:" // Height 1: no bias - "tbz %x[flags], #0, 14f\n" - "cmp x16, #0x10\n" - "bge 13f\n" - "tbz x16, #3, 8f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v9.4s }, [x13], #0x10\n" - "tbz x16, #2, 6f\n" - "ld1 { v10.4s }, [x13], #0x10\n" - "tbz x16, #1, 5f\n" + "bgt 67f\n" + "beq 34f\n" + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x9, %x[bias]\n" + "mov x28, %x[output_ptr]\n" + "2:" // Height 1: Column loop + "cbz x9, 3f\n" + "ldr q8, [x9, #0x0]\n" + "ldr q9, [x9, #0x10]\n" + "ldr q10, [x9, #0x20]\n" + "ldr q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + "b 14f\n" + "3:" // Height 1: no bias + "tbz %x[flags], #0, 13f\n" + "cmp x11, #0x10\n" + "bge 12f\n" + "tbz x11, #3, 7f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "tbz x11, #2, 5f\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "tbz x11, #1, 4f\n" "mov x19, #0x38\n" - "ldr d11, [x13], #0x8\n" - "tbz x16, #0, 12f\n" - "ld1 { v11.s }[2], [x13]\n" - "b 12f\n" - "5:" // Height 1: Partial accumulate: partial_1_12 + "ldr d11, [x28], #0x8\n" + "tbz x11, #0, 11f\n" + "ld1 { v11.s }[2], [x28]\n" + "b 11f\n" + "4:" // Height 1: Partial accumulate: partial_1_12 "mov x19, #0x30\n" - "tbz x16, #0, 12f\n" - "ldr s11, [x13, #0x0]\n" - "b 12f\n" - "6:" // Height 1: Partial accumulate: partial_2_8 - "tbz x16, #1, 7f\n" - "ldr d10, [x13], #0x8\n" + "tbz x11, #0, 11f\n" + "ldr s11, [x28, #0x0]\n" + "b 11f\n" + "5:" // Height 1: Partial 
accumulate: partial_2_8 + "tbz x11, #1, 6f\n" + "ldr d10, [x28], #0x8\n" "mov x19, #0x28\n" - "tbz x16, #0, 12f\n" - "ld1 { v10.s }[2], [x13]\n" - "b 12f\n" - "7:" // Height 1: Partial accumulate: partial_1_8 + "tbz x11, #0, 11f\n" + "ld1 { v10.s }[2], [x28]\n" + "b 11f\n" + "6:" // Height 1: Partial accumulate: partial_1_8 "mov x19, #0x20\n" - "tbz x16, #0, 12f\n" - "ldr s10, [x13, #0x0]\n" - "b 12f\n" - "8:" // Height 1: Partial accumulate: partial_4_0 - "tbz x16, #2, 10f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "tbz x16, #1, 9f\n" + "tbz x11, #0, 11f\n" + "ldr s10, [x28, #0x0]\n" + "b 11f\n" + "7:" // Height 1: Partial accumulate: partial_4_0 + "tbz x11, #2, 9f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "tbz x11, #1, 8f\n" + "ldr d9, [x28], #0x8\n" "mov x19, #0x18\n" - "ldr d9, [x13], #0x8\n" - "tbz x16, #0, 12f\n" - "ld1 { v9.s }[2], [x13]\n" - "b 12f\n" - "9:" // Height 1: Partial accumulate: partial_1_4 + "tbz x11, #0, 11f\n" + "ld1 { v9.s }[2], [x28]\n" + "b 11f\n" + "8:" // Height 1: Partial accumulate: partial_1_4 "mov x19, #0x10\n" - "tbz x16, #0, 12f\n" - "ldr s9, [x13, #0x0]\n" - "b 12f\n" - "10:" // Height 1: Partial accumulate: partial_2_0 - "tbz x16, #1, 11f\n" - "ldr d8, [x13], #0x8\n" + "tbz x11, #0, 11f\n" + "ldr s9, [x28, #0x0]\n" + "b 11f\n" + "9:" // Height 1: Partial accumulate: partial_2_0 + "tbz x11, #1, 10f\n" + "ldr d8, [x28], #0x8\n" "mov x19, #0x8\n" - "tbz x16, #0, 12f\n" - "ld1 { v8.s }[2], [x13]\n" - "b 12f\n" - "11:" // Height 1: Partial accumulate: partial_1_0 + "tbz x11, #0, 11f\n" + "ld1 { v8.s }[2], [x28]\n" + "b 11f\n" + "10:" // Height 1: Partial accumulate: partial_1_0 + "ldr s8, [x28, #0x0]\n" "mov x19, #0x0\n" - "ldr s8, [x13, #0x0]\n" - "12:" // Height 1: Partial accumulate: Done - "sub x13, x13, x19\n" - "b 15f\n" - "13:" // Height 1: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "b 15f\n" - "14:" // Height 1: no accumulate + "11:" // Height 1: 
Partial accumulate: Done + "sub x28, x28, x19\n" + "b 14f\n" + "12:" // Height 1: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "b 14f\n" + "13:" // Height 1: no accumulate "movi v8.16b, #0x0\n" "movi v9.16b, #0x0\n" "movi v10.16b, #0x0\n" "movi v11.16b, #0x0\n" - "15:" // Height 1: setup done - "mov x12, #0x0\n" - "16:" // Height 1: String loop + "14:" // Height 1: setup done + "mov x27, #0x0\n" + "15:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 17f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 16f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "cbnz x12, 18f\n" + "ldr x25, [x20, #0x0]\n" + "cbnz x27, 17f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #2\n" - "b 18f\n" - "17:" // Height 1: setup direct input - "mov x10, %x[input_ptr]\n" - "18:" // Height 1: input setup done - "cmp x11, #0x4\n" - "blt 21f\n" - "cmp x11, #0x8\n" + "add x25, x25, x19, LSL #2\n" + "b 17f\n" + "16:" // Height 1: setup direct input + "mov x25, %x[input_ptr]\n" + "17:" // Height 1: input setup done + "cmp x26, #0x4\n" "blt 20f\n" - "19:" // Height 1: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "ldr q0, [x25, #0x0]\n" + "ldr q6, [x10, #0x0]\n" + "cmp x26, #0x8\n" + "blt 19f\n" + "18:" // Height 1: Multiply loop: Main loop head "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x15, #0x10]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q7, [x10, #0x10]\n" + "add x25, x25, #0x10\n" "fmla v9.4s, v7.4s, v0.s[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q6, [x10, #0x20]\n" + "sub x26, x26, #0x4\n" "fmla v10.4s, v6.4s, v0.s[0]\n" - "ldr q6, [x15, #0x40]\n" - "add x10, x10, #0x10\n" + "ldr 
q7, [x10, #0x30]\n" + "cmp x26, #0x8\n" "fmla v11.4s, v7.4s, v0.s[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q6, [x10, #0x40]\n" + "ldr q7, [x10, #0x50]\n" "fmla v8.4s, v6.4s, v0.s[1]\n" - "ldr q6, [x15, #0x60]\n" - "sub x11, x11, #0x4\n" + "ldr q6, [x10, #0x60]\n" "fmla v9.4s, v7.4s, v0.s[1]\n" - "ldr q7, [x15, #0x70]\n" - "cmp x11, #0x8\n" + "ldr q7, [x10, #0x70]\n" + "prfm pldl1keep, [x25, #0x80]\n" "fmla v10.4s, v6.4s, v0.s[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" "fmla v11.4s, v7.4s, v0.s[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" "fmla v8.4s, v6.4s, v0.s[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" "fmla v9.4s, v7.4s, v0.s[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" "fmla v10.4s, v6.4s, v0.s[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" "fmla v11.4s, v7.4s, v0.s[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" "fmla v8.4s, v6.4s, v0.s[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" "fmla v9.4s, v7.4s, v0.s[3]\n" - "ldr q7, [x15, #0xf0]\n" - "add x15, x15, #0x100\n" + "ldr q7, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" "fmla v10.4s, v6.4s, v0.s[3]\n" + "ldr q6, [x10, #0x0]\n" "fmla v11.4s, v7.4s, v0.s[3]\n" - "bge 19b\n" - "20:" // Height 1: Multiply loop: Single iteration only - "sub x11, x11, #0x4\n" - "ldr q0, [x10, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "ldr q0, [x25, #0x0]\n" + "bge 18b\n" + "19:" // Height 1: Multiply loop: Single iteration only "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x15, #0x10]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q7, [x10, #0x10]\n" + "sub x26, x26, #0x4\n" "fmla v9.4s, v7.4s, v0.s[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q6, [x10, #0x20]\n" + "add x25, x25, #0x10\n" "fmla v10.4s, v6.4s, v0.s[0]\n" - "ldr q6, [x15, #0x40]\n" - "add x10, x10, #0x10\n" + "ldr q7, [x10, #0x30]\n" + "ldr q6, [x10, #0x40]\n" "fmla v11.4s, v7.4s, v0.s[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "ldr q7, [x15, #0x50]\n" + 
"ldr q7, [x10, #0x50]\n" "fmla v8.4s, v6.4s, v0.s[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" + "prfm pldl1keep, [x25, #0x80]\n" "fmla v9.4s, v7.4s, v0.s[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" "fmla v10.4s, v6.4s, v0.s[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" "fmla v11.4s, v7.4s, v0.s[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" "fmla v8.4s, v6.4s, v0.s[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" "fmla v9.4s, v7.4s, v0.s[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" "fmla v10.4s, v6.4s, v0.s[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" "fmla v11.4s, v7.4s, v0.s[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" "fmla v8.4s, v6.4s, v0.s[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" "fmla v9.4s, v7.4s, v0.s[3]\n" - "ldr q7, [x15, #0xf0]\n" - "add x15, x15, #0x100\n" + "ldr q7, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" "fmla v10.4s, v6.4s, v0.s[3]\n" "fmla v11.4s, v7.4s, v0.s[3]\n" - "21:" // Height 1: Multiply loop: Main loop skip - "cbz x11, 23f\n" - "22:" // Height 1: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr q6, [x15, #0x0]\n" + "20:" // Height 1: Multiply loop: Main loop skip + "cbz x26, 22f\n" + "21:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x1\n" + "ldr q6, [x10, #0x0]\n" "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x15, #0x10]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q7, [x10, #0x10]\n" + "ldr q6, [x10, #0x20]\n" "fmla v9.4s, v7.4s, v0.s[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" "fmla v10.4s, v6.4s, v0.s[0]\n" - "sub x11, x11, #0x1\n" - "add x15, x15, #0x40\n" "fmla v11.4s, v7.4s, v0.s[0]\n" - "cbnz x11, 22b\n" - "23:" // Height 1: Multiply loop: No odd multiplies + "cbnz x26, 21b\n" + "22:" // Height 1: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - 
"cmp x12, x19\n" - "bne 16b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "tbz %x[flags], #1, 24f\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 15b\n" + "prfm pstl1keep, [x28, #0x0]\n" + "tbz %x[flags], #1, 23f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1r { v1.4s }, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -323,178 +317,170 @@ void a64_hybrid_fp32_mla_6x16 ( "fmax v9.4s, v9.4s, v1.4s\n" "fmax v10.4s, v10.4s, v1.4s\n" "fmax v11.4s, v11.4s, v1.4s\n" - "24:" // Height 1: No activation - "cmp x16, #0x10\n" - "bge 33f\n" - "tbz x16, #3, 28f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v9.4s }, [x13], #0x10\n" - "tbz x16, #2, 26f\n" - "st1 { v10.4s }, [x13], #0x10\n" - "tbz x16, #1, 25f\n" - "str d11, [x13], #0x8\n" - "tbz x16, #0, 32f\n" - "st1 { v11.s }[2], [x13]\n" - "b 32f\n" - "25:" // Height 1: Partial direct writeback: partial_1_12 - "tbz x16, #0, 32f\n" - "str s11, [x13, #0x0]\n" - "b 32f\n" - "26:" // Height 1: Partial direct writeback: partial_2_8 - "tbz x16, #1, 27f\n" - "str d10, [x13], #0x8\n" - "tbz x16, #0, 32f\n" - "st1 { v10.s }[2], [x13]\n" - "b 32f\n" - "27:" // Height 1: Partial direct writeback: partial_1_8 - "tbz x16, #0, 32f\n" - "str s10, [x13, #0x0]\n" - "b 32f\n" - "28:" // Height 1: Partial direct writeback: partial_4_0 - "tbz x16, #2, 30f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "tbz x16, #1, 29f\n" - "str d9, [x13], #0x8\n" - "tbz x16, #0, 32f\n" - "st1 { v9.s }[2], [x13]\n" - "b 32f\n" - "29:" // Height 1: Partial direct writeback: partial_1_4 - "tbz x16, #0, 32f\n" - "str s9, [x13, #0x0]\n" - "b 32f\n" - "30:" // Height 1: Partial direct writeback: partial_2_0 - "tbz x16, #1, 31f\n" - "str d8, [x13], #0x8\n" - "tbz x16, #0, 32f\n" - "st1 { v8.s }[2], [x13]\n" - "b 32f\n" - "31:" // Height 1: Partial direct writeback: partial_1_0 - "str s8, [x13, #0x0]\n" - "32:" // Height 1: Partial direct writeback: Done - "b 34f\n" - "33:" // Height 1: Full writeback - "str q8, [x13, #0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, 
#0x20]\n" - "str q11, [x13, #0x30]\n" - "add x13, x13, #0x40\n" - "34:" // Height 1: Writeback done - "subs x16, x16, #0x10\n" - "bgt 3b\n" - "b 206f\n" - "35:" // Height 2 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 36f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "add x9, x9, x19, LSL #2\n" - "b 37f\n" - "36:" // Height 2: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "37:" // Height 2: Column loop - "cbz x14, 38f\n" - "ldr q8, [x14, #0x0]\n" + "23:" // Height 1: No activation + "cmp x11, #0x10\n" + "bge 32f\n" + "tbz x11, #3, 27f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v9.4s }, [x28], #0x10\n" + "tbz x11, #2, 25f\n" + "st1 { v10.4s }, [x28], #0x10\n" + "tbz x11, #1, 24f\n" + "str d11, [x28], #0x8\n" + "tbz x11, #0, 31f\n" + "st1 { v11.s }[2], [x28]\n" + "b 31f\n" + "24:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x11, #0, 31f\n" + "str s11, [x28, #0x0]\n" + "b 31f\n" + "25:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x11, #1, 26f\n" + "str d10, [x28], #0x8\n" + "tbz x11, #0, 31f\n" + "st1 { v10.s }[2], [x28]\n" + "b 31f\n" + "26:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x11, #0, 31f\n" + "str s10, [x28, #0x0]\n" + "b 31f\n" + "27:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x11, #2, 29f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "tbz x11, #1, 28f\n" + "str d9, [x28], #0x8\n" + "tbz x11, #0, 31f\n" + "st1 { v9.s }[2], [x28]\n" + "b 31f\n" + "28:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x11, #0, 31f\n" + "str s9, [x28, #0x0]\n" + "b 31f\n" + "29:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x11, #1, 30f\n" + "str d8, [x28], #0x8\n" + "tbz x11, #0, 31f\n" + "st1 { v8.s }[2], [x28]\n" + "b 31f\n" + "30:" // Height 1: 
Partial direct writeback: partial_1_0 + "str s8, [x28, #0x0]\n" + "31:" // Height 1: Partial direct writeback: Done + "b 33f\n" + "32:" // Height 1: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "33:" // Height 1: Writeback done + "subs x11, x11, #0x10\n" + "bgt 2b\n" + "b 200f\n" + "34:" // Height 2 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "35:" // Height 2: Column loop + "cbz x9, 36f\n" + "ldr q8, [x9, #0x0]\n" "mov v12.16b, v8.16b\n" - "ldr q9, [x14, #0x10]\n" - "ldr q10, [x14, #0x20]\n" + "ldr q9, [x9, #0x10]\n" + "ldr q10, [x9, #0x20]\n" "mov v13.16b, v9.16b\n" - "ldr q11, [x14, #0x30]\n" + "ldr q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" "mov v14.16b, v10.16b\n" - "add x14, x14, #0x40\n" "mov v15.16b, v11.16b\n" - "b 49f\n" - "38:" // Height 2: no bias - "tbz %x[flags], #0, 48f\n" - "cmp x16, #0x10\n" - "bge 47f\n" - "tbz x16, #3, 42f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v9.4s }, [x13], #0x10\n" - "ld1 { v13.4s }, [x9], #0x10\n" - "tbz x16, #2, 40f\n" - "ld1 { v10.4s }, [x13], #0x10\n" - "ld1 { v14.4s }, [x9], #0x10\n" - "tbz x16, #1, 39f\n" + "b 47f\n" + "36:" // Height 2: no bias + "tbz %x[flags], #0, 46f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + "add x24, x28, x19, LSL #2\n" + "bge 45f\n" + "tbz x11, #3, 40f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x24], #0x10\n" + "tbz x11, #2, 38f\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x24], #0x10\n" + "tbz x11, #1, 37f\n" "mov x19, #0x38\n" - "ldr d11, [x13], #0x8\n" - "ldr d15, [x9], #0x8\n" - "tbz x16, #0, 46f\n" - "ld1 { v11.s }[2], [x13]\n" - "ld1 { v15.s }[2], [x9]\n" - "b 46f\n" - "39:" // Height 2: Partial accumulate: 
partial_1_12 + "ldr d11, [x28], #0x8\n" + "ldr d15, [x24], #0x8\n" + "tbz x11, #0, 44f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x24]\n" + "b 44f\n" + "37:" // Height 2: Partial accumulate: partial_1_12 "mov x19, #0x30\n" - "tbz x16, #0, 46f\n" - "ldr s11, [x13, #0x0]\n" - "ldr s15, [x9, #0x0]\n" - "b 46f\n" - "40:" // Height 2: Partial accumulate: partial_2_8 - "tbz x16, #1, 41f\n" - "ldr d10, [x13], #0x8\n" - "ldr d14, [x9], #0x8\n" + "tbz x11, #0, 44f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s15, [x24, #0x0]\n" + "b 44f\n" + "38:" // Height 2: Partial accumulate: partial_2_8 + "tbz x11, #1, 39f\n" + "ldr d10, [x28], #0x8\n" + "ldr d14, [x24], #0x8\n" "mov x19, #0x28\n" - "tbz x16, #0, 46f\n" - "ld1 { v10.s }[2], [x13]\n" - "ld1 { v14.s }[2], [x9]\n" - "b 46f\n" - "41:" // Height 2: Partial accumulate: partial_1_8 + "tbz x11, #0, 44f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x24]\n" + "b 44f\n" + "39:" // Height 2: Partial accumulate: partial_1_8 "mov x19, #0x20\n" - "tbz x16, #0, 46f\n" - "ldr s10, [x13, #0x0]\n" - "ldr s14, [x9, #0x0]\n" - "b 46f\n" - "42:" // Height 2: Partial accumulate: partial_4_0 - "tbz x16, #2, 44f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "tbz x16, #1, 43f\n" + "tbz x11, #0, 44f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s14, [x24, #0x0]\n" + "b 44f\n" + "40:" // Height 2: Partial accumulate: partial_4_0 + "tbz x11, #2, 42f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "tbz x11, #1, 41f\n" "mov x19, #0x18\n" - "ldr d9, [x13], #0x8\n" - "ldr d13, [x9], #0x8\n" - "tbz x16, #0, 46f\n" - "ld1 { v9.s }[2], [x13]\n" - "ld1 { v13.s }[2], [x9]\n" - "b 46f\n" - "43:" // Height 2: Partial accumulate: partial_1_4 + "ldr d9, [x28], #0x8\n" + "ldr d13, [x24], #0x8\n" + "tbz x11, #0, 44f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x24]\n" + "b 44f\n" + "41:" // Height 2: Partial accumulate: partial_1_4 "mov x19, #0x10\n" - "tbz x16, #0, 46f\n" - "ldr s9, [x13, 
#0x0]\n" - "ldr s13, [x9, #0x0]\n" - "b 46f\n" - "44:" // Height 2: Partial accumulate: partial_2_0 - "tbz x16, #1, 45f\n" - "ldr d8, [x13], #0x8\n" - "ldr d12, [x9], #0x8\n" + "tbz x11, #0, 44f\n" + "ldr s9, [x28, #0x0]\n" + "ldr s13, [x24, #0x0]\n" + "b 44f\n" + "42:" // Height 2: Partial accumulate: partial_2_0 + "tbz x11, #1, 43f\n" + "ldr d8, [x28], #0x8\n" + "ldr d12, [x24], #0x8\n" "mov x19, #0x8\n" - "tbz x16, #0, 46f\n" - "ld1 { v8.s }[2], [x13]\n" - "ld1 { v12.s }[2], [x9]\n" - "b 46f\n" - "45:" // Height 2: Partial accumulate: partial_1_0 + "tbz x11, #0, 44f\n" + "ld1 { v8.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x24]\n" + "b 44f\n" + "43:" // Height 2: Partial accumulate: partial_1_0 + "ldr s8, [x28, #0x0]\n" "mov x19, #0x0\n" - "ldr s8, [x13, #0x0]\n" - "ldr s12, [x9, #0x0]\n" - "46:" // Height 2: Partial accumulate: Done - "sub x13, x13, x19\n" - "sub x9, x9, x19\n" - "b 49f\n" - "47:" // Height 2: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "ldr q12, [x9, #0x0]\n" - "ldr q13, [x9, #0x10]\n" - "ldr q14, [x9, #0x20]\n" - "ldr q15, [x9, #0x30]\n" - "b 49f\n" - "48:" // Height 2: no accumulate + "ldr s12, [x24, #0x0]\n" + "44:" // Height 2: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 47f\n" + "45:" // Height 2: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "ldr q12, [x24, #0x0]\n" + "ldr q13, [x24, #0x10]\n" + "ldr q14, [x24, #0x20]\n" + "ldr q15, [x24, #0x30]\n" + "b 47f\n" + "46:" // Height 2: no accumulate "movi v8.16b, #0x0\n" "movi v9.16b, #0x0\n" "movi v10.16b, #0x0\n" @@ -503,174 +489,176 @@ void a64_hybrid_fp32_mla_6x16 ( "movi v13.16b, #0x0\n" "movi v14.16b, #0x0\n" "movi v15.16b, #0x0\n" - "49:" // Height 2: setup done - "mov x12, #0x0\n" - "50:" // Height 2: String loop + "47:" // Height 2: setup done + "mov x27, #0x0\n" + "48:" // Height 2: String loop "ldr x20, 
[%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 51f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 49f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "cbnz x12, 52f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x27, 50f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #2\n" - "add x28, x28, x19, LSL #2\n" - "b 52f\n" - "51:" // Height 2: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #2\n" - "52:" // Height 2: input setup done - "cmp x11, #0x4\n" - "blt 55f\n" - "cmp x11, #0x8\n" - "blt 54f\n" - "53:" // Height 2: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "b 50f\n" + "49:" // Height 2: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #2\n" + "50:" // Height 2: input setup done + "cmp x26, #0x4\n" + "blt 53f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x8\n" + "ldr q6, [x10, #0x0]\n" + "blt 52f\n" + "51:" // Height 2: Multiply loop: Main loop head "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" + "add x25, x25, #0x10\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "ldr q6, [x15, #0x20]\n" - "add x10, x10, #0x10\n" + "ldr q6, [x10, #0x20]\n" + "add x24, x24, #0x10\n" "fmla v9.4s, v7.4s, v0.s[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "sub x26, x26, #0x4\n" "fmla v13.4s, v7.4s, v1.s[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" + "cmp x26, #0x8\n" "fmla v10.4s, v6.4s, v0.s[0]\n" - "sub x11, x11, #0x4\n" + "prfm 
pldl1keep, [x24, #0x80]\n" "fmla v14.4s, v6.4s, v1.s[0]\n" - "ldr q6, [x15, #0x40]\n" - "cmp x11, #0x8\n" + "ldr q6, [x10, #0x40]\n" "fmla v11.4s, v7.4s, v0.s[0]\n" "fmla v15.4s, v7.4s, v1.s[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" "fmla v8.4s, v6.4s, v0.s[1]\n" "fmla v12.4s, v6.4s, v1.s[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" "fmla v9.4s, v7.4s, v0.s[1]\n" "fmla v13.4s, v7.4s, v1.s[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" "fmla v10.4s, v6.4s, v0.s[1]\n" "fmla v14.4s, v6.4s, v1.s[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" "fmla v11.4s, v7.4s, v0.s[1]\n" "fmla v15.4s, v7.4s, v1.s[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" "fmla v8.4s, v6.4s, v0.s[2]\n" "fmla v12.4s, v6.4s, v1.s[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" "fmla v9.4s, v7.4s, v0.s[2]\n" "fmla v13.4s, v7.4s, v1.s[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" "fmla v10.4s, v6.4s, v0.s[2]\n" "fmla v14.4s, v6.4s, v1.s[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" "fmla v11.4s, v7.4s, v0.s[2]\n" "fmla v15.4s, v7.4s, v1.s[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" "fmla v8.4s, v6.4s, v0.s[3]\n" "fmla v12.4s, v6.4s, v1.s[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" "fmla v9.4s, v7.4s, v0.s[3]\n" "fmla v13.4s, v7.4s, v1.s[3]\n" - "ldr q7, [x15, #0xf0]\n" - "add x15, x15, #0x100\n" + "ldr q7, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" "fmla v10.4s, v6.4s, v0.s[3]\n" "fmla v14.4s, v6.4s, v1.s[3]\n" + "ldr q6, [x10, #0x0]\n" "fmla v11.4s, v7.4s, v0.s[3]\n" + "ldr q0, [x25, #0x0]\n" "fmla v15.4s, v7.4s, v1.s[3]\n" - "bge 53b\n" - "54:" // Height 2: Multiply loop: Single iteration only - "sub x11, x11, #0x4\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "bge 51b\n" + "52:" // Height 2: Multiply loop: Single iteration only "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr 
q7, [x10, #0x10]\n" + "sub x26, x26, #0x4\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "ldr q6, [x15, #0x20]\n" - "add x10, x10, #0x10\n" + "ldr q6, [x10, #0x20]\n" + "add x25, x25, #0x10\n" "fmla v9.4s, v7.4s, v0.s[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" "fmla v13.4s, v7.4s, v1.s[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" "fmla v10.4s, v6.4s, v0.s[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" "fmla v14.4s, v6.4s, v1.s[0]\n" - "ldr q6, [x15, #0x40]\n" + "ldr q6, [x10, #0x40]\n" "fmla v11.4s, v7.4s, v0.s[0]\n" "fmla v15.4s, v7.4s, v1.s[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" "fmla v8.4s, v6.4s, v0.s[1]\n" "fmla v12.4s, v6.4s, v1.s[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" "fmla v9.4s, v7.4s, v0.s[1]\n" "fmla v13.4s, v7.4s, v1.s[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" "fmla v10.4s, v6.4s, v0.s[1]\n" "fmla v14.4s, v6.4s, v1.s[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" "fmla v11.4s, v7.4s, v0.s[1]\n" "fmla v15.4s, v7.4s, v1.s[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" "fmla v8.4s, v6.4s, v0.s[2]\n" "fmla v12.4s, v6.4s, v1.s[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" "fmla v9.4s, v7.4s, v0.s[2]\n" "fmla v13.4s, v7.4s, v1.s[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" "fmla v10.4s, v6.4s, v0.s[2]\n" "fmla v14.4s, v6.4s, v1.s[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" "fmla v11.4s, v7.4s, v0.s[2]\n" "fmla v15.4s, v7.4s, v1.s[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" "fmla v8.4s, v6.4s, v0.s[3]\n" "fmla v12.4s, v6.4s, v1.s[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" "fmla v9.4s, v7.4s, v0.s[3]\n" "fmla v13.4s, v7.4s, v1.s[3]\n" - "ldr q7, [x15, #0xf0]\n" - "add x15, x15, #0x100\n" + "ldr q7, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" "fmla v10.4s, v6.4s, v0.s[3]\n" "fmla 
v14.4s, v6.4s, v1.s[3]\n" "fmla v11.4s, v7.4s, v0.s[3]\n" "fmla v15.4s, v7.4s, v1.s[3]\n" - "55:" // Height 2: Multiply loop: Main loop skip - "cbz x11, 57f\n" - "56:" // Height 2: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x28], #0x4\n" - "ldr q6, [x15, #0x0]\n" + "53:" // Height 2: Multiply loop: Main loop skip + "cbz x26, 55f\n" + "54:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x1\n" + "ldr s1, [x24], #0x4\n" + "ldr q6, [x10, #0x0]\n" "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "ldr q6, [x15, #0x20]\n" - "sub x11, x11, #0x1\n" + "ldr q6, [x10, #0x20]\n" "fmla v9.4s, v7.4s, v0.s[0]\n" "fmla v13.4s, v7.4s, v1.s[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" "fmla v10.4s, v6.4s, v0.s[0]\n" - "add x15, x15, #0x40\n" "fmla v14.4s, v6.4s, v1.s[0]\n" "fmla v11.4s, v7.4s, v0.s[0]\n" "fmla v15.4s, v7.4s, v1.s[0]\n" - "cbnz x11, 56b\n" - "57:" // Height 2: Multiply loop: No odd multiplies + "cbnz x26, 54b\n" + "55:" // Height 2: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 50b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "tbz %x[flags], #1, 58f\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 48b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x24, x28, x19, LSL #2\n" + "prfm pstl1keep, [x24, #0x0]\n" + "tbz %x[flags], #1, 56f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1r { v1.4s }, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -691,227 +679,215 @@ void a64_hybrid_fp32_mla_6x16 ( "fmax v14.4s, v14.4s, v1.4s\n" "fmin v15.4s, v15.4s, v0.4s\n" "fmax v15.4s, v15.4s, v1.4s\n" - "58:" // Height 2: No activation - "cmp x16, #0x10\n" - "bge 67f\n" - "tbz x16, #3, 62f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { 
v9.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v13.4s }, [x9], #0x10\n" - "tbz x16, #2, 60f\n" - "st1 { v10.4s }, [x13], #0x10\n" - "st1 { v14.4s }, [x9], #0x10\n" - "tbz x16, #1, 59f\n" - "str d11, [x13], #0x8\n" - "str d15, [x9], #0x8\n" - "tbz x16, #0, 66f\n" - "st1 { v11.s }[2], [x13]\n" - "st1 { v15.s }[2], [x9]\n" - "b 66f\n" - "59:" // Height 2: Partial direct writeback: partial_1_12 - "tbz x16, #0, 66f\n" - "str s11, [x13, #0x0]\n" - "str s15, [x9, #0x0]\n" - "b 66f\n" - "60:" // Height 2: Partial direct writeback: partial_2_8 - "tbz x16, #1, 61f\n" - "str d10, [x13], #0x8\n" - "str d14, [x9], #0x8\n" - "tbz x16, #0, 66f\n" - "st1 { v10.s }[2], [x13]\n" - "st1 { v14.s }[2], [x9]\n" - "b 66f\n" - "61:" // Height 2: Partial direct writeback: partial_1_8 - "tbz x16, #0, 66f\n" - "str s10, [x13, #0x0]\n" - "str s14, [x9, #0x0]\n" - "b 66f\n" - "62:" // Height 2: Partial direct writeback: partial_4_0 - "tbz x16, #2, 64f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "tbz x16, #1, 63f\n" - "str d9, [x13], #0x8\n" - "str d13, [x9], #0x8\n" - "tbz x16, #0, 66f\n" - "st1 { v9.s }[2], [x13]\n" - "st1 { v13.s }[2], [x9]\n" - "b 66f\n" - "63:" // Height 2: Partial direct writeback: partial_1_4 - "tbz x16, #0, 66f\n" - "str s9, [x13, #0x0]\n" - "str s13, [x9, #0x0]\n" + "56:" // Height 2: No activation + "cmp x11, #0x10\n" + "bge 65f\n" + "tbz x11, #3, 60f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v9.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x24], #0x10\n" + "st1 { v13.4s }, [x24], #0x10\n" + "tbz x11, #2, 58f\n" + "st1 { v10.4s }, [x28], #0x10\n" + "st1 { v14.4s }, [x24], #0x10\n" + "tbz x11, #1, 57f\n" + "str d11, [x28], #0x8\n" + "str d15, [x24], #0x8\n" + "tbz x11, #0, 64f\n" + "st1 { v11.s }[2], [x28]\n" + "st1 { v15.s }[2], [x24]\n" + "b 64f\n" + "57:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x11, #0, 64f\n" + "str s11, [x28, #0x0]\n" + "str s15, [x24, #0x0]\n" + "b 64f\n" + "58:" // Height 2: 
Partial direct writeback: partial_2_8 + "tbz x11, #1, 59f\n" + "str d10, [x28], #0x8\n" + "str d14, [x24], #0x8\n" + "tbz x11, #0, 64f\n" + "st1 { v10.s }[2], [x28]\n" + "st1 { v14.s }[2], [x24]\n" + "b 64f\n" + "59:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x11, #0, 64f\n" + "str s10, [x28, #0x0]\n" + "str s14, [x24, #0x0]\n" + "b 64f\n" + "60:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x11, #2, 62f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x24], #0x10\n" + "tbz x11, #1, 61f\n" + "str d9, [x28], #0x8\n" + "str d13, [x24], #0x8\n" + "tbz x11, #0, 64f\n" + "st1 { v9.s }[2], [x28]\n" + "st1 { v13.s }[2], [x24]\n" + "b 64f\n" + "61:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x11, #0, 64f\n" + "str s9, [x28, #0x0]\n" + "str s13, [x24, #0x0]\n" + "b 64f\n" + "62:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x11, #1, 63f\n" + "str d8, [x28], #0x8\n" + "str d12, [x24], #0x8\n" + "tbz x11, #0, 64f\n" + "st1 { v8.s }[2], [x28]\n" + "st1 { v12.s }[2], [x24]\n" + "b 64f\n" + "63:" // Height 2: Partial direct writeback: partial_1_0 + "str s8, [x28, #0x0]\n" + "str s12, [x24, #0x0]\n" + "64:" // Height 2: Partial direct writeback: Done "b 66f\n" - "64:" // Height 2: Partial direct writeback: partial_2_0 - "tbz x16, #1, 65f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "tbz x16, #0, 66f\n" - "st1 { v8.s }[2], [x13]\n" - "st1 { v12.s }[2], [x9]\n" - "b 66f\n" - "65:" // Height 2: Partial direct writeback: partial_1_0 - "str s8, [x13, #0x0]\n" - "str s12, [x9, #0x0]\n" - "66:" // Height 2: Partial direct writeback: Done - "b 68f\n" - "67:" // Height 2: Full writeback - "str q8, [x13, #0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "str q12, [x9, #0x0]\n" - "str q13, [x9, #0x10]\n" - "str q14, [x9, #0x20]\n" - "str q15, [x9, #0x30]\n" - "add x13, x13, #0x40\n" - "add x9, x9, #0x40\n" - "68:" // Height 2: Writeback done - "subs x16, x16, #0x10\n" - 
"bgt 37b\n" - "b 206f\n" - "69:" // Height 3 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 70f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "add x27, x27, x19, LSL #2\n" - "b 71f\n" - "70:" // Height 3: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "71:" // Height 3: Column loop - "cbz x14, 72f\n" - "ldr q8, [x14, #0x0]\n" + "65:" // Height 2: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q12, [x24, #0x0]\n" + "str q13, [x24, #0x10]\n" + "str q14, [x24, #0x20]\n" + "str q15, [x24, #0x30]\n" + "66:" // Height 2: Writeback done + "subs x11, x11, #0x10\n" + "bgt 35b\n" + "b 200f\n" + "67:" // Height 3 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "68:" // Height 3: Column loop + "cbz x9, 69f\n" + "ldr q8, [x9, #0x0]\n" "mov v12.16b, v8.16b\n" - "ldr q9, [x14, #0x10]\n" + "ldr q9, [x9, #0x10]\n" "mov v16.16b, v8.16b\n" - "ldr q10, [x14, #0x20]\n" - "ldr q11, [x14, #0x30]\n" + "ldr q10, [x9, #0x20]\n" + "ldr q11, [x9, #0x30]\n" "mov v13.16b, v9.16b\n" - "add x14, x14, #0x40\n" + "add x9, x9, #0x40\n" "mov v17.16b, v9.16b\n" "mov v14.16b, v10.16b\n" "mov v15.16b, v11.16b\n" "mov v18.16b, v10.16b\n" "mov v19.16b, v11.16b\n" - "b 83f\n" - "72:" // Height 3: no bias - "tbz %x[flags], #0, 82f\n" - "cmp x16, #0x10\n" - "bge 81f\n" - "tbz x16, #3, 76f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "ld1 { v9.4s }, [x13], #0x10\n" - "ld1 { v13.4s }, [x9], 
#0x10\n" - "ld1 { v17.4s }, [x27], #0x10\n" - "tbz x16, #2, 74f\n" - "ld1 { v10.4s }, [x13], #0x10\n" - "ld1 { v14.4s }, [x9], #0x10\n" - "ld1 { v18.4s }, [x27], #0x10\n" - "tbz x16, #1, 73f\n" - "mov x19, #0x38\n" - "ldr d11, [x13], #0x8\n" - "ldr d15, [x9], #0x8\n" - "ldr d19, [x27], #0x8\n" - "tbz x16, #0, 80f\n" - "ld1 { v11.s }[2], [x13]\n" - "ld1 { v15.s }[2], [x9]\n" - "ld1 { v19.s }[2], [x27]\n" "b 80f\n" - "73:" // Height 3: Partial accumulate: partial_1_12 + "69:" // Height 3: no bias + "tbz %x[flags], #0, 79f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "bge 78f\n" + "tbz x11, #3, 73f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v16.4s }, [x23], #0x10\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x24], #0x10\n" + "ld1 { v17.4s }, [x23], #0x10\n" + "tbz x11, #2, 71f\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x24], #0x10\n" + "ld1 { v18.4s }, [x23], #0x10\n" + "tbz x11, #1, 70f\n" + "mov x19, #0x38\n" + "ldr d11, [x28], #0x8\n" + "ldr d15, [x24], #0x8\n" + "ldr d19, [x23], #0x8\n" + "tbz x11, #0, 77f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x24]\n" + "ld1 { v19.s }[2], [x23]\n" + "b 77f\n" + "70:" // Height 3: Partial accumulate: partial_1_12 "mov x19, #0x30\n" - "tbz x16, #0, 80f\n" - "ldr s11, [x13, #0x0]\n" - "ldr s15, [x9, #0x0]\n" - "ldr s19, [x27, #0x0]\n" - "b 80f\n" - "74:" // Height 3: Partial accumulate: partial_2_8 - "tbz x16, #1, 75f\n" - "ldr d10, [x13], #0x8\n" - "ldr d14, [x9], #0x8\n" - "ldr d18, [x27], #0x8\n" + "tbz x11, #0, 77f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s15, [x24, #0x0]\n" + "ldr s19, [x23, #0x0]\n" + "b 77f\n" + "71:" // Height 3: Partial accumulate: partial_2_8 + "tbz x11, #1, 72f\n" + "ldr d10, [x28], #0x8\n" + "ldr d14, [x24], #0x8\n" "mov x19, #0x28\n" - "tbz x16, #0, 80f\n" - "ld1 { v10.s }[2], [x13]\n" - "ld1 { v14.s }[2], [x9]\n" - "ld1 { v18.s 
}[2], [x27]\n" - "b 80f\n" - "75:" // Height 3: Partial accumulate: partial_1_8 + "ldr d18, [x23], #0x8\n" + "tbz x11, #0, 77f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x24]\n" + "ld1 { v18.s }[2], [x23]\n" + "b 77f\n" + "72:" // Height 3: Partial accumulate: partial_1_8 "mov x19, #0x20\n" - "tbz x16, #0, 80f\n" - "ldr s10, [x13, #0x0]\n" - "ldr s14, [x9, #0x0]\n" - "ldr s18, [x27, #0x0]\n" - "b 80f\n" - "76:" // Height 3: Partial accumulate: partial_4_0 - "tbz x16, #2, 78f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "tbz x16, #1, 77f\n" + "tbz x11, #0, 77f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s14, [x24, #0x0]\n" + "ldr s18, [x23, #0x0]\n" + "b 77f\n" + "73:" // Height 3: Partial accumulate: partial_4_0 + "tbz x11, #2, 75f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v16.4s }, [x23], #0x10\n" + "tbz x11, #1, 74f\n" "mov x19, #0x18\n" - "ldr d9, [x13], #0x8\n" - "ldr d13, [x9], #0x8\n" - "ldr d17, [x27], #0x8\n" - "tbz x16, #0, 80f\n" - "ld1 { v9.s }[2], [x13]\n" - "ld1 { v13.s }[2], [x9]\n" - "ld1 { v17.s }[2], [x27]\n" - "b 80f\n" - "77:" // Height 3: Partial accumulate: partial_1_4 + "ldr d9, [x28], #0x8\n" + "ldr d13, [x24], #0x8\n" + "ldr d17, [x23], #0x8\n" + "tbz x11, #0, 77f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x24]\n" + "ld1 { v17.s }[2], [x23]\n" + "b 77f\n" + "74:" // Height 3: Partial accumulate: partial_1_4 "mov x19, #0x10\n" - "tbz x16, #0, 80f\n" - "ldr s9, [x13, #0x0]\n" - "ldr s13, [x9, #0x0]\n" - "ldr s17, [x27, #0x0]\n" - "b 80f\n" - "78:" // Height 3: Partial accumulate: partial_2_0 - "tbz x16, #1, 79f\n" - "ldr d8, [x13], #0x8\n" - "ldr d12, [x9], #0x8\n" - "ldr d16, [x27], #0x8\n" + "tbz x11, #0, 77f\n" + "ldr s9, [x28, #0x0]\n" + "ldr s13, [x24, #0x0]\n" + "ldr s17, [x23, #0x0]\n" + "b 77f\n" + "75:" // Height 3: Partial accumulate: partial_2_0 + "tbz x11, #1, 76f\n" + "ldr d8, [x28], #0x8\n" + "ldr d12, [x24], 
#0x8\n" "mov x19, #0x8\n" - "tbz x16, #0, 80f\n" - "ld1 { v8.s }[2], [x13]\n" - "ld1 { v12.s }[2], [x9]\n" - "ld1 { v16.s }[2], [x27]\n" - "b 80f\n" - "79:" // Height 3: Partial accumulate: partial_1_0 + "ldr d16, [x23], #0x8\n" + "tbz x11, #0, 77f\n" + "ld1 { v8.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x24]\n" + "ld1 { v16.s }[2], [x23]\n" + "b 77f\n" + "76:" // Height 3: Partial accumulate: partial_1_0 + "ldr s8, [x28, #0x0]\n" "mov x19, #0x0\n" - "ldr s8, [x13, #0x0]\n" - "ldr s12, [x9, #0x0]\n" - "ldr s16, [x27, #0x0]\n" - "80:" // Height 3: Partial accumulate: Done - "sub x13, x13, x19\n" - "sub x9, x9, x19\n" - "sub x27, x27, x19\n" - "b 83f\n" - "81:" // Height 3: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "ldr q12, [x9, #0x0]\n" - "ldr q13, [x9, #0x10]\n" - "ldr q14, [x9, #0x20]\n" - "ldr q15, [x9, #0x30]\n" - "ldr q16, [x27, #0x0]\n" - "ldr q17, [x27, #0x10]\n" - "ldr q18, [x27, #0x20]\n" - "ldr q19, [x27, #0x30]\n" - "b 83f\n" - "82:" // Height 3: no accumulate + "ldr s12, [x24, #0x0]\n" + "ldr s16, [x23, #0x0]\n" + "77:" // Height 3: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 80f\n" + "78:" // Height 3: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "ldr q12, [x24, #0x0]\n" + "ldr q13, [x24, #0x10]\n" + "ldr q14, [x24, #0x20]\n" + "ldr q15, [x24, #0x30]\n" + "ldr q16, [x23, #0x0]\n" + "ldr q17, [x23, #0x10]\n" + "ldr q18, [x23, #0x20]\n" + "ldr q19, [x23, #0x30]\n" + "b 80f\n" + "79:" // Height 3: no accumulate "movi v8.16b, #0x0\n" "movi v9.16b, #0x0\n" "movi v10.16b, #0x0\n" @@ -924,221 +900,224 @@ void a64_hybrid_fp32_mla_6x16 ( "movi v17.16b, #0x0\n" "movi v18.16b, #0x0\n" "movi v19.16b, #0x0\n" - "83:" // Height 3: setup done - "mov x12, #0x0\n" - "84:" // Height 3: String loop + "80:" // Height 3: setup done + "mov x27, #0x0\n" + "81:" // Height 3: String loop "ldr x20, 
[%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 85f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 82f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "cbnz x12, 86f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "cbnz x27, 83f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #2\n" - "add x28, x28, x19, LSL #2\n" - "add x26, x26, x19, LSL #2\n" - "b 86f\n" - "85:" // Height 3: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #2\n" - "add x26, x28, x19, LSL #2\n" - "86:" // Height 3: input setup done - "cmp x11, #0x4\n" - "blt 89f\n" - "cmp x11, #0x8\n" - "blt 88f\n" - "87:" // Height 3: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "b 83f\n" + "82:" // Height 3: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "83:" // Height 3: input setup done + "cmp x26, #0x4\n" + "blt 86f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x8\n" + "ldr q2, [x23, #0x0]\n" + "ldr q6, [x10, #0x0]\n" + "blt 85f\n" + "84:" // Height 3: Multiply loop: Main loop head "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" + "add x25, x25, #0x10\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" "fmla v16.4s, v6.4s, v2.s[0]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q6, [x10, #0x20]\n" + "add x23, x23, #0x10\n" 
"fmla v9.4s, v7.4s, v0.s[0]\n" - "add x28, x28, #0x10\n" - "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sub x26, x26, #0x4\n" "fmla v13.4s, v7.4s, v1.s[0]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x23, #0x80]\n" + "cmp x26, #0x8\n" "fmla v17.4s, v7.4s, v2.s[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" "fmla v10.4s, v6.4s, v0.s[0]\n" - "sub x11, x11, #0x4\n" "fmla v14.4s, v6.4s, v1.s[0]\n" - "cmp x11, #0x8\n" "fmla v18.4s, v6.4s, v2.s[0]\n" - "ldr q6, [x15, #0x40]\n" + "ldr q6, [x10, #0x40]\n" "fmla v11.4s, v7.4s, v0.s[0]\n" "fmla v15.4s, v7.4s, v1.s[0]\n" "fmla v19.4s, v7.4s, v2.s[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" "fmla v8.4s, v6.4s, v0.s[1]\n" "fmla v12.4s, v6.4s, v1.s[1]\n" "fmla v16.4s, v6.4s, v2.s[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" "fmla v9.4s, v7.4s, v0.s[1]\n" "fmla v13.4s, v7.4s, v1.s[1]\n" "fmla v17.4s, v7.4s, v2.s[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" "fmla v10.4s, v6.4s, v0.s[1]\n" "fmla v14.4s, v6.4s, v1.s[1]\n" "fmla v18.4s, v6.4s, v2.s[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" "fmla v11.4s, v7.4s, v0.s[1]\n" "fmla v15.4s, v7.4s, v1.s[1]\n" "fmla v19.4s, v7.4s, v2.s[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" "fmla v8.4s, v6.4s, v0.s[2]\n" "fmla v12.4s, v6.4s, v1.s[2]\n" "fmla v16.4s, v6.4s, v2.s[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" "fmla v9.4s, v7.4s, v0.s[2]\n" "fmla v13.4s, v7.4s, v1.s[2]\n" "fmla v17.4s, v7.4s, v2.s[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" "fmla v10.4s, v6.4s, v0.s[2]\n" "fmla v14.4s, v6.4s, v1.s[2]\n" "fmla v18.4s, v6.4s, v2.s[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" "fmla v11.4s, v7.4s, v0.s[2]\n" "fmla v15.4s, v7.4s, v1.s[2]\n" "fmla v19.4s, v7.4s, v2.s[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" "fmla v8.4s, v6.4s, v0.s[3]\n" "fmla v12.4s, v6.4s, v1.s[3]\n" "fmla 
v16.4s, v6.4s, v2.s[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" "fmla v9.4s, v7.4s, v0.s[3]\n" "fmla v13.4s, v7.4s, v1.s[3]\n" "fmla v17.4s, v7.4s, v2.s[3]\n" - "ldr q7, [x15, #0xf0]\n" - "add x15, x15, #0x100\n" + "ldr q7, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" "fmla v10.4s, v6.4s, v0.s[3]\n" "fmla v14.4s, v6.4s, v1.s[3]\n" "fmla v18.4s, v6.4s, v2.s[3]\n" + "ldr q6, [x10, #0x0]\n" "fmla v11.4s, v7.4s, v0.s[3]\n" + "ldr q0, [x25, #0x0]\n" "fmla v15.4s, v7.4s, v1.s[3]\n" + "ldr q1, [x24, #0x0]\n" "fmla v19.4s, v7.4s, v2.s[3]\n" - "bge 87b\n" - "88:" // Height 3: Multiply loop: Single iteration only - "sub x11, x11, #0x4\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "ldr q2, [x23, #0x0]\n" + "bge 84b\n" + "85:" // Height 3: Multiply loop: Single iteration only "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" + "sub x26, x26, #0x4\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "add x25, x25, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" "fmla v16.4s, v6.4s, v2.s[0]\n" - "ldr q6, [x15, #0x20]\n" + "add x24, x24, #0x10\n" "fmla v9.4s, v7.4s, v0.s[0]\n" - "add x28, x28, #0x10\n" - "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x10, #0x20]\n" + "add x23, x23, #0x10\n" "fmla v13.4s, v7.4s, v1.s[0]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" "fmla v17.4s, v7.4s, v2.s[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "ldr q7, [x15, #0x30]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "ldr q7, [x10, #0x30]\n" "fmla v10.4s, v6.4s, v0.s[0]\n" "fmla v14.4s, v6.4s, v1.s[0]\n" "fmla v18.4s, v6.4s, v2.s[0]\n" - "ldr q6, [x15, #0x40]\n" + "ldr q6, [x10, #0x40]\n" "fmla v11.4s, v7.4s, v0.s[0]\n" "fmla v15.4s, v7.4s, v1.s[0]\n" "fmla v19.4s, v7.4s, v2.s[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" "fmla v8.4s, v6.4s, v0.s[1]\n" "fmla v12.4s, v6.4s, v1.s[1]\n" "fmla v16.4s, v6.4s, v2.s[1]\n" - "ldr 
q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" "fmla v9.4s, v7.4s, v0.s[1]\n" "fmla v13.4s, v7.4s, v1.s[1]\n" "fmla v17.4s, v7.4s, v2.s[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" "fmla v10.4s, v6.4s, v0.s[1]\n" "fmla v14.4s, v6.4s, v1.s[1]\n" "fmla v18.4s, v6.4s, v2.s[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" "fmla v11.4s, v7.4s, v0.s[1]\n" "fmla v15.4s, v7.4s, v1.s[1]\n" "fmla v19.4s, v7.4s, v2.s[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" "fmla v8.4s, v6.4s, v0.s[2]\n" "fmla v12.4s, v6.4s, v1.s[2]\n" "fmla v16.4s, v6.4s, v2.s[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" "fmla v9.4s, v7.4s, v0.s[2]\n" "fmla v13.4s, v7.4s, v1.s[2]\n" "fmla v17.4s, v7.4s, v2.s[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" "fmla v10.4s, v6.4s, v0.s[2]\n" "fmla v14.4s, v6.4s, v1.s[2]\n" "fmla v18.4s, v6.4s, v2.s[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" "fmla v11.4s, v7.4s, v0.s[2]\n" "fmla v15.4s, v7.4s, v1.s[2]\n" "fmla v19.4s, v7.4s, v2.s[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" "fmla v8.4s, v6.4s, v0.s[3]\n" "fmla v12.4s, v6.4s, v1.s[3]\n" "fmla v16.4s, v6.4s, v2.s[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" "fmla v9.4s, v7.4s, v0.s[3]\n" "fmla v13.4s, v7.4s, v1.s[3]\n" "fmla v17.4s, v7.4s, v2.s[3]\n" - "ldr q7, [x15, #0xf0]\n" - "add x15, x15, #0x100\n" + "ldr q7, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" "fmla v10.4s, v6.4s, v0.s[3]\n" "fmla v14.4s, v6.4s, v1.s[3]\n" "fmla v18.4s, v6.4s, v2.s[3]\n" "fmla v11.4s, v7.4s, v0.s[3]\n" "fmla v15.4s, v7.4s, v1.s[3]\n" "fmla v19.4s, v7.4s, v2.s[3]\n" - "89:" // Height 3: Multiply loop: Main loop skip - "cbz x11, 91f\n" - "90:" // Height 3: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x28], #0x4\n" - "ldr s2, [x26], #0x4\n" - "ldr q6, [x15, #0x0]\n" + "86:" // Height 3: Multiply loop: Main loop skip + "cbz x26, 88f\n" + "87:" // Height 3: Multiply loop: Odd block loop + "ldr s0, 
[x25], #0x4\n" + "sub x26, x26, #0x1\n" + "ldr s1, [x24], #0x4\n" + "ldr s2, [x23], #0x4\n" + "ldr q6, [x10, #0x0]\n" "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "sub x11, x11, #0x1\n" "fmla v16.4s, v6.4s, v2.s[0]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q6, [x10, #0x20]\n" "fmla v9.4s, v7.4s, v0.s[0]\n" "fmla v13.4s, v7.4s, v1.s[0]\n" "fmla v17.4s, v7.4s, v2.s[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" "fmla v10.4s, v6.4s, v0.s[0]\n" - "add x15, x15, #0x40\n" "fmla v14.4s, v6.4s, v1.s[0]\n" "fmla v18.4s, v6.4s, v2.s[0]\n" "fmla v11.4s, v7.4s, v0.s[0]\n" "fmla v15.4s, v7.4s, v1.s[0]\n" "fmla v19.4s, v7.4s, v2.s[0]\n" - "cbnz x11, 90b\n" - "91:" // Height 3: Multiply loop: No odd multiplies + "cbnz x26, 87b\n" + "88:" // Height 3: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 84b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "prfm pstl1keep, [x27, #0x0]\n" - "tbz %x[flags], #1, 92f\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 81b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x24, x28, x19, LSL #2\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbz %x[flags], #1, 89f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1r { v1.4s }, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -1167,137 +1146,120 @@ void a64_hybrid_fp32_mla_6x16 ( "fmin v19.4s, v19.4s, v0.4s\n" "fmax v18.4s, v18.4s, v1.4s\n" "fmax v19.4s, v19.4s, v1.4s\n" - "92:" // Height 3: No activation - "cmp x16, #0x10\n" - "bge 101f\n" - "tbz x16, #3, 96f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v9.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v13.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "st1 { v17.4s }, [x27], #0x10\n" - 
"tbz x16, #2, 94f\n" - "st1 { v10.4s }, [x13], #0x10\n" - "st1 { v14.4s }, [x9], #0x10\n" - "st1 { v18.4s }, [x27], #0x10\n" - "tbz x16, #1, 93f\n" - "str d11, [x13], #0x8\n" - "str d15, [x9], #0x8\n" - "str d19, [x27], #0x8\n" - "tbz x16, #0, 100f\n" - "st1 { v11.s }[2], [x13]\n" - "st1 { v15.s }[2], [x9]\n" - "st1 { v19.s }[2], [x27]\n" - "b 100f\n" - "93:" // Height 3: Partial direct writeback: partial_1_12 - "tbz x16, #0, 100f\n" - "str s11, [x13, #0x0]\n" - "str s15, [x9, #0x0]\n" - "str s19, [x27, #0x0]\n" - "b 100f\n" - "94:" // Height 3: Partial direct writeback: partial_2_8 - "tbz x16, #1, 95f\n" - "str d10, [x13], #0x8\n" - "str d14, [x9], #0x8\n" - "str d18, [x27], #0x8\n" - "tbz x16, #0, 100f\n" - "st1 { v10.s }[2], [x13]\n" - "st1 { v14.s }[2], [x9]\n" - "st1 { v18.s }[2], [x27]\n" - "b 100f\n" - "95:" // Height 3: Partial direct writeback: partial_1_8 - "tbz x16, #0, 100f\n" - "str s10, [x13, #0x0]\n" - "str s14, [x9, #0x0]\n" - "str s18, [x27, #0x0]\n" - "b 100f\n" - "96:" // Height 3: Partial direct writeback: partial_4_0 - "tbz x16, #2, 98f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "tbz x16, #1, 97f\n" - "str d9, [x13], #0x8\n" - "str d13, [x9], #0x8\n" - "str d17, [x27], #0x8\n" - "tbz x16, #0, 100f\n" - "st1 { v9.s }[2], [x13]\n" - "st1 { v13.s }[2], [x9]\n" - "st1 { v17.s }[2], [x27]\n" - "b 100f\n" - "97:" // Height 3: Partial direct writeback: partial_1_4 - "tbz x16, #0, 100f\n" - "str s9, [x13, #0x0]\n" - "str s13, [x9, #0x0]\n" - "str s17, [x27, #0x0]\n" - "b 100f\n" - "98:" // Height 3: Partial direct writeback: partial_2_0 - "tbz x16, #1, 99f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "str d16, [x27], #0x8\n" - "tbz x16, #0, 100f\n" - "st1 { v8.s }[2], [x13]\n" - "st1 { v12.s }[2], [x9]\n" - "st1 { v16.s }[2], [x27]\n" - "b 100f\n" - "99:" // Height 3: Partial direct writeback: partial_1_0 - "str s8, [x13, #0x0]\n" - "str s12, [x9, #0x0]\n" - "str s16, [x27, 
#0x0]\n" - "100:" // Height 3: Partial direct writeback: Done - "b 102f\n" - "101:" // Height 3: Full writeback - "str q8, [x13, #0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "str q12, [x9, #0x0]\n" - "str q13, [x9, #0x10]\n" - "str q14, [x9, #0x20]\n" - "str q15, [x9, #0x30]\n" - "str q16, [x27, #0x0]\n" - "str q17, [x27, #0x10]\n" - "str q18, [x27, #0x20]\n" - "str q19, [x27, #0x30]\n" - "add x13, x13, #0x40\n" - "add x9, x9, #0x40\n" - "add x27, x27, #0x40\n" - "102:" // Height 3: Writeback done - "subs x16, x16, #0x10\n" - "bgt 71b\n" - "b 206f\n" - "103:" // Height 4 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 104f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "add x27, x27, x19, LSL #2\n" - "add x25, x25, x19, LSL #2\n" - "b 105f\n" - "104:" // Height 4: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "105:" // Height 4: Column loop - "cbz x14, 106f\n" - "ldr q8, [x14, #0x0]\n" + "89:" // Height 3: No activation + "cmp x11, #0x10\n" + "bge 98f\n" + "tbz x11, #3, 93f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v9.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x24], #0x10\n" + "st1 { v13.4s }, [x24], #0x10\n" + "st1 { v16.4s }, [x23], #0x10\n" + "st1 { v17.4s }, [x23], #0x10\n" + "tbz x11, #2, 91f\n" + "st1 { v10.4s }, [x28], #0x10\n" + "st1 { v14.4s }, [x24], #0x10\n" + "st1 { v18.4s }, [x23], #0x10\n" + "tbz x11, #1, 90f\n" + "str d11, [x28], #0x8\n" + "str d15, [x24], #0x8\n" + "str d19, [x23], #0x8\n" + "tbz x11, #0, 97f\n" + "st1 { v11.s }[2], [x28]\n" + "st1 { v15.s }[2], [x24]\n" + "st1 { 
v19.s }[2], [x23]\n" + "b 97f\n" + "90:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x11, #0, 97f\n" + "str s11, [x28, #0x0]\n" + "str s15, [x24, #0x0]\n" + "str s19, [x23, #0x0]\n" + "b 97f\n" + "91:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x11, #1, 92f\n" + "str d10, [x28], #0x8\n" + "str d14, [x24], #0x8\n" + "str d18, [x23], #0x8\n" + "tbz x11, #0, 97f\n" + "st1 { v10.s }[2], [x28]\n" + "st1 { v14.s }[2], [x24]\n" + "st1 { v18.s }[2], [x23]\n" + "b 97f\n" + "92:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x11, #0, 97f\n" + "str s10, [x28, #0x0]\n" + "str s14, [x24, #0x0]\n" + "str s18, [x23, #0x0]\n" + "b 97f\n" + "93:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x11, #2, 95f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x24], #0x10\n" + "st1 { v16.4s }, [x23], #0x10\n" + "tbz x11, #1, 94f\n" + "str d9, [x28], #0x8\n" + "str d13, [x24], #0x8\n" + "str d17, [x23], #0x8\n" + "tbz x11, #0, 97f\n" + "st1 { v9.s }[2], [x28]\n" + "st1 { v13.s }[2], [x24]\n" + "st1 { v17.s }[2], [x23]\n" + "b 97f\n" + "94:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x11, #0, 97f\n" + "str s9, [x28, #0x0]\n" + "str s13, [x24, #0x0]\n" + "str s17, [x23, #0x0]\n" + "b 97f\n" + "95:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x11, #1, 96f\n" + "str d8, [x28], #0x8\n" + "str d12, [x24], #0x8\n" + "str d16, [x23], #0x8\n" + "tbz x11, #0, 97f\n" + "st1 { v8.s }[2], [x28]\n" + "st1 { v12.s }[2], [x24]\n" + "st1 { v16.s }[2], [x23]\n" + "b 97f\n" + "96:" // Height 3: Partial direct writeback: partial_1_0 + "str s8, [x28, #0x0]\n" + "str s12, [x24, #0x0]\n" + "str s16, [x23, #0x0]\n" + "97:" // Height 3: Partial direct writeback: Done + "b 99f\n" + "98:" // Height 3: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q12, [x24, #0x0]\n" + "str q13, [x24, #0x10]\n" + "str q14, [x24, 
#0x20]\n" + "str q15, [x24, #0x30]\n" + "str q16, [x23, #0x0]\n" + "str q17, [x23, #0x10]\n" + "str q18, [x23, #0x20]\n" + "str q19, [x23, #0x30]\n" + "99:" // Height 3: Writeback done + "subs x11, x11, #0x10\n" + "bgt 68b\n" + "b 200f\n" + "100:" // Height 4 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "101:" // Height 4: Column loop + "cbz x9, 102f\n" + "ldr q8, [x9, #0x0]\n" "mov v12.16b, v8.16b\n" - "ldr q9, [x14, #0x10]\n" + "ldr q9, [x9, #0x10]\n" "mov v16.16b, v8.16b\n" - "ldr q10, [x14, #0x20]\n" + "ldr q10, [x9, #0x20]\n" "mov v20.16b, v8.16b\n" - "ldr q11, [x14, #0x30]\n" - "add x14, x14, #0x40\n" + "ldr q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" "mov v13.16b, v9.16b\n" "mov v17.16b, v9.16b\n" "mov v14.16b, v10.16b\n" @@ -1307,136 +1269,137 @@ void a64_hybrid_fp32_mla_6x16 ( "mov v21.16b, v9.16b\n" "mov v22.16b, v10.16b\n" "mov v23.16b, v11.16b\n" - "b 117f\n" - "106:" // Height 4: no bias - "tbz %x[flags], #0, 116f\n" - "cmp x16, #0x10\n" - "bge 115f\n" - "tbz x16, #3, 110f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "ld1 { v20.4s }, [x25], #0x10\n" - "ld1 { v9.4s }, [x13], #0x10\n" - "ld1 { v13.4s }, [x9], #0x10\n" - "ld1 { v17.4s }, [x27], #0x10\n" - "ld1 { v21.4s }, [x25], #0x10\n" - "tbz x16, #2, 108f\n" - "ld1 { v10.4s }, [x13], #0x10\n" - "ld1 { v14.4s }, [x9], #0x10\n" - "ld1 { v18.4s }, [x27], #0x10\n" - "ld1 { v22.4s }, [x25], #0x10\n" - "tbz x16, #1, 107f\n" + "b 113f\n" + "102:" // Height 4: no bias + "tbz %x[flags], #0, 112f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "bge 111f\n" + "tbz x11, #3, 106f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v16.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + 
"ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x24], #0x10\n" + "ld1 { v17.4s }, [x23], #0x10\n" + "ld1 { v21.4s }, [x22], #0x10\n" + "tbz x11, #2, 104f\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x24], #0x10\n" + "ld1 { v18.4s }, [x23], #0x10\n" + "ld1 { v22.4s }, [x22], #0x10\n" + "tbz x11, #1, 103f\n" "mov x19, #0x38\n" - "ldr d11, [x13], #0x8\n" - "ldr d15, [x9], #0x8\n" - "ldr d19, [x27], #0x8\n" - "ldr d23, [x25], #0x8\n" - "tbz x16, #0, 114f\n" - "ld1 { v11.s }[2], [x13]\n" - "ld1 { v15.s }[2], [x9]\n" - "ld1 { v19.s }[2], [x27]\n" - "ld1 { v23.s }[2], [x25]\n" - "b 114f\n" - "107:" // Height 4: Partial accumulate: partial_1_12 + "ldr d11, [x28], #0x8\n" + "ldr d15, [x24], #0x8\n" + "ldr d19, [x23], #0x8\n" + "ldr d23, [x22], #0x8\n" + "tbz x11, #0, 110f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x24]\n" + "ld1 { v19.s }[2], [x23]\n" + "ld1 { v23.s }[2], [x22]\n" + "b 110f\n" + "103:" // Height 4: Partial accumulate: partial_1_12 "mov x19, #0x30\n" - "tbz x16, #0, 114f\n" - "ldr s11, [x13, #0x0]\n" - "ldr s15, [x9, #0x0]\n" - "ldr s19, [x27, #0x0]\n" - "ldr s23, [x25, #0x0]\n" - "b 114f\n" - "108:" // Height 4: Partial accumulate: partial_2_8 - "tbz x16, #1, 109f\n" - "ldr d10, [x13], #0x8\n" - "ldr d14, [x9], #0x8\n" - "ldr d18, [x27], #0x8\n" - "ldr d22, [x25], #0x8\n" + "tbz x11, #0, 110f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s15, [x24, #0x0]\n" + "ldr s19, [x23, #0x0]\n" + "ldr s23, [x22, #0x0]\n" + "b 110f\n" + "104:" // Height 4: Partial accumulate: partial_2_8 + "tbz x11, #1, 105f\n" + "ldr d10, [x28], #0x8\n" + "ldr d14, [x24], #0x8\n" "mov x19, #0x28\n" - "tbz x16, #0, 114f\n" - "ld1 { v10.s }[2], [x13]\n" - "ld1 { v14.s }[2], [x9]\n" - "ld1 { v18.s }[2], [x27]\n" - "ld1 { v22.s }[2], [x25]\n" - "b 114f\n" - "109:" // Height 4: Partial accumulate: partial_1_8 + "ldr d18, [x23], #0x8\n" + "ldr d22, [x22], #0x8\n" + "tbz x11, #0, 110f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x24]\n" + "ld1 { v18.s }[2], 
[x23]\n" + "ld1 { v22.s }[2], [x22]\n" + "b 110f\n" + "105:" // Height 4: Partial accumulate: partial_1_8 "mov x19, #0x20\n" - "tbz x16, #0, 114f\n" - "ldr s10, [x13, #0x0]\n" - "ldr s14, [x9, #0x0]\n" - "ldr s18, [x27, #0x0]\n" - "ldr s22, [x25, #0x0]\n" - "b 114f\n" - "110:" // Height 4: Partial accumulate: partial_4_0 - "tbz x16, #2, 112f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "ld1 { v20.4s }, [x25], #0x10\n" - "tbz x16, #1, 111f\n" + "tbz x11, #0, 110f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s14, [x24, #0x0]\n" + "ldr s18, [x23, #0x0]\n" + "ldr s22, [x22, #0x0]\n" + "b 110f\n" + "106:" // Height 4: Partial accumulate: partial_4_0 + "tbz x11, #2, 108f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v16.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "tbz x11, #1, 107f\n" "mov x19, #0x18\n" - "ldr d9, [x13], #0x8\n" - "ldr d13, [x9], #0x8\n" - "ldr d17, [x27], #0x8\n" - "ldr d21, [x25], #0x8\n" - "tbz x16, #0, 114f\n" - "ld1 { v9.s }[2], [x13]\n" - "ld1 { v13.s }[2], [x9]\n" - "ld1 { v17.s }[2], [x27]\n" - "ld1 { v21.s }[2], [x25]\n" - "b 114f\n" - "111:" // Height 4: Partial accumulate: partial_1_4 + "ldr d9, [x28], #0x8\n" + "ldr d13, [x24], #0x8\n" + "ldr d17, [x23], #0x8\n" + "ldr d21, [x22], #0x8\n" + "tbz x11, #0, 110f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x24]\n" + "ld1 { v17.s }[2], [x23]\n" + "ld1 { v21.s }[2], [x22]\n" + "b 110f\n" + "107:" // Height 4: Partial accumulate: partial_1_4 "mov x19, #0x10\n" - "tbz x16, #0, 114f\n" - "ldr s9, [x13, #0x0]\n" - "ldr s13, [x9, #0x0]\n" - "ldr s17, [x27, #0x0]\n" - "ldr s21, [x25, #0x0]\n" - "b 114f\n" - "112:" // Height 4: Partial accumulate: partial_2_0 - "tbz x16, #1, 113f\n" - "ldr d8, [x13], #0x8\n" - "ldr d12, [x9], #0x8\n" - "ldr d16, [x27], #0x8\n" - "ldr d20, [x25], #0x8\n" + "tbz x11, #0, 110f\n" + "ldr s9, [x28, #0x0]\n" + "ldr s13, [x24, #0x0]\n" + "ldr s17, [x23, #0x0]\n" + 
"ldr s21, [x22, #0x0]\n" + "b 110f\n" + "108:" // Height 4: Partial accumulate: partial_2_0 + "tbz x11, #1, 109f\n" + "ldr d8, [x28], #0x8\n" + "ldr d12, [x24], #0x8\n" "mov x19, #0x8\n" - "tbz x16, #0, 114f\n" - "ld1 { v8.s }[2], [x13]\n" - "ld1 { v12.s }[2], [x9]\n" - "ld1 { v16.s }[2], [x27]\n" - "ld1 { v20.s }[2], [x25]\n" - "b 114f\n" - "113:" // Height 4: Partial accumulate: partial_1_0 + "ldr d16, [x23], #0x8\n" + "ldr d20, [x22], #0x8\n" + "tbz x11, #0, 110f\n" + "ld1 { v8.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x24]\n" + "ld1 { v16.s }[2], [x23]\n" + "ld1 { v20.s }[2], [x22]\n" + "b 110f\n" + "109:" // Height 4: Partial accumulate: partial_1_0 + "ldr s8, [x28, #0x0]\n" "mov x19, #0x0\n" - "ldr s8, [x13, #0x0]\n" - "ldr s12, [x9, #0x0]\n" - "ldr s16, [x27, #0x0]\n" - "ldr s20, [x25, #0x0]\n" - "114:" // Height 4: Partial accumulate: Done - "sub x13, x13, x19\n" - "sub x9, x9, x19\n" - "sub x27, x27, x19\n" - "sub x25, x25, x19\n" - "b 117f\n" - "115:" // Height 4: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "ldr q12, [x9, #0x0]\n" - "ldr q13, [x9, #0x10]\n" - "ldr q14, [x9, #0x20]\n" - "ldr q15, [x9, #0x30]\n" - "ldr q16, [x27, #0x0]\n" - "ldr q17, [x27, #0x10]\n" - "ldr q18, [x27, #0x20]\n" - "ldr q19, [x27, #0x30]\n" - "ldr q20, [x25, #0x0]\n" - "ldr q21, [x25, #0x10]\n" - "ldr q22, [x25, #0x20]\n" - "ldr q23, [x25, #0x30]\n" - "b 117f\n" - "116:" // Height 4: no accumulate + "ldr s12, [x24, #0x0]\n" + "ldr s16, [x23, #0x0]\n" + "ldr s20, [x22, #0x0]\n" + "110:" // Height 4: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 113f\n" + "111:" // Height 4: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "ldr q12, [x24, #0x0]\n" + "ldr q13, [x24, #0x10]\n" + "ldr q14, [x24, #0x20]\n" + "ldr q15, [x24, #0x30]\n" + "ldr q16, [x23, #0x0]\n" + "ldr q17, [x23, #0x10]\n" + "ldr q18, [x23, #0x20]\n" + 
"ldr q19, [x23, #0x30]\n" + "ldr q20, [x22, #0x0]\n" + "ldr q21, [x22, #0x10]\n" + "ldr q22, [x22, #0x20]\n" + "ldr q23, [x22, #0x30]\n" + "b 113f\n" + "112:" // Height 4: no accumulate "movi v8.16b, #0x0\n" "movi v9.16b, #0x0\n" "movi v10.16b, #0x0\n" @@ -1453,220 +1416,220 @@ void a64_hybrid_fp32_mla_6x16 ( "movi v21.16b, #0x0\n" "movi v22.16b, #0x0\n" "movi v23.16b, #0x0\n" - "117:" // Height 4: setup done - "mov x12, #0x0\n" - "118:" // Height 4: String loop + "113:" // Height 4: setup done + "mov x27, #0x0\n" + "114:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 119f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 115f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "cbnz x12, 120f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "cbnz x27, 116f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #2\n" - "add x28, x28, x19, LSL #2\n" - "add x26, x26, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" "add x24, x24, x19, LSL #2\n" - "b 120f\n" - "119:" // Height 4: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #2\n" - "add x26, x28, x19, LSL #2\n" - "add x24, x26, x19, LSL #2\n" - "120:" // Height 4: input setup done - "cmp x11, #0x4\n" - "blt 123f\n" - "cmp x11, #0x8\n" - "blt 122f\n" - "121:" // Height 4: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "add x23, x23, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "b 116f\n" + "115:" // Height 4: setup direct input + "mov x25, 
%x[input_ptr]\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "116:" // Height 4: input setup done + "cmp x26, #0x4\n" + "blt 119f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x8\n" + "ldr q2, [x23, #0x0]\n" + "ldr q3, [x22, #0x0]\n" + "ldr q6, [x10, #0x0]\n" + "blt 118f\n" + "117:" // Height 4: Multiply loop: Main loop head "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" + "add x25, x25, #0x10\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" "fmla v16.4s, v6.4s, v2.s[0]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x23, x23, #0x10\n" "fmla v20.4s, v6.4s, v3.s[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q6, [x10, #0x20]\n" + "add x22, x22, #0x10\n" "fmla v9.4s, v7.4s, v0.s[0]\n" - "add x26, x26, #0x10\n" - "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "sub x26, x26, #0x4\n" "fmla v13.4s, v7.4s, v1.s[0]\n" - "add x24, x24, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "cmp x26, #0x8\n" "fmla v17.4s, v7.4s, v2.s[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "sub x11, x11, #0x4\n" "fmla v21.4s, v7.4s, v3.s[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" "fmla v10.4s, v6.4s, v0.s[0]\n" - "cmp x11, #0x8\n" "fmla v14.4s, v6.4s, v1.s[0]\n" "fmla v18.4s, v6.4s, v2.s[0]\n" "fmla v22.4s, v6.4s, v3.s[0]\n" - "ldr q6, [x15, #0x40]\n" + "ldr q6, [x10, #0x40]\n" "fmla v11.4s, v7.4s, v0.s[0]\n" "fmla v15.4s, v7.4s, v1.s[0]\n" "fmla v19.4s, v7.4s, v2.s[0]\n" "fmla v23.4s, v7.4s, v3.s[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" "fmla v8.4s, v6.4s, v0.s[1]\n" "fmla v12.4s, v6.4s, v1.s[1]\n" "fmla v16.4s, v6.4s, v2.s[1]\n" "fmla v20.4s, v6.4s, v3.s[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" "fmla v9.4s, v7.4s, v0.s[1]\n" "fmla v13.4s, v7.4s, 
v1.s[1]\n" "fmla v17.4s, v7.4s, v2.s[1]\n" "fmla v21.4s, v7.4s, v3.s[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" "fmla v10.4s, v6.4s, v0.s[1]\n" "fmla v14.4s, v6.4s, v1.s[1]\n" "fmla v18.4s, v6.4s, v2.s[1]\n" "fmla v22.4s, v6.4s, v3.s[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" "fmla v11.4s, v7.4s, v0.s[1]\n" "fmla v15.4s, v7.4s, v1.s[1]\n" "fmla v19.4s, v7.4s, v2.s[1]\n" "fmla v23.4s, v7.4s, v3.s[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" "fmla v8.4s, v6.4s, v0.s[2]\n" "fmla v12.4s, v6.4s, v1.s[2]\n" "fmla v16.4s, v6.4s, v2.s[2]\n" "fmla v20.4s, v6.4s, v3.s[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" "fmla v9.4s, v7.4s, v0.s[2]\n" "fmla v13.4s, v7.4s, v1.s[2]\n" "fmla v17.4s, v7.4s, v2.s[2]\n" "fmla v21.4s, v7.4s, v3.s[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" "fmla v10.4s, v6.4s, v0.s[2]\n" "fmla v14.4s, v6.4s, v1.s[2]\n" "fmla v18.4s, v6.4s, v2.s[2]\n" "fmla v22.4s, v6.4s, v3.s[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" "fmla v11.4s, v7.4s, v0.s[2]\n" "fmla v15.4s, v7.4s, v1.s[2]\n" "fmla v19.4s, v7.4s, v2.s[2]\n" "fmla v23.4s, v7.4s, v3.s[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" "fmla v8.4s, v6.4s, v0.s[3]\n" "fmla v12.4s, v6.4s, v1.s[3]\n" "fmla v16.4s, v6.4s, v2.s[3]\n" "fmla v20.4s, v6.4s, v3.s[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" "fmla v9.4s, v7.4s, v0.s[3]\n" "fmla v13.4s, v7.4s, v1.s[3]\n" "fmla v17.4s, v7.4s, v2.s[3]\n" "fmla v21.4s, v7.4s, v3.s[3]\n" - "ldr q7, [x15, #0xf0]\n" - "add x15, x15, #0x100\n" + "ldr q7, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" "fmla v10.4s, v6.4s, v0.s[3]\n" "fmla v14.4s, v6.4s, v1.s[3]\n" "fmla v18.4s, v6.4s, v2.s[3]\n" "fmla v22.4s, v6.4s, v3.s[3]\n" + "ldr q6, [x10, #0x0]\n" "fmla v11.4s, v7.4s, v0.s[3]\n" + "ldr q0, [x25, #0x0]\n" "fmla v15.4s, v7.4s, v1.s[3]\n" + "ldr q1, [x24, #0x0]\n" "fmla v19.4s, v7.4s, v2.s[3]\n" + "ldr q2, [x23, #0x0]\n" "fmla v23.4s, v7.4s, v3.s[3]\n" - 
"bge 121b\n" - "122:" // Height 4: Multiply loop: Single iteration only - "sub x11, x11, #0x4\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "ldr q3, [x22, #0x0]\n" + "bge 117b\n" + "118:" // Height 4: Multiply loop: Single iteration only "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" + "sub x26, x26, #0x4\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "add x25, x25, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" "fmla v16.4s, v6.4s, v2.s[0]\n" - "add x28, x28, #0x10\n" + "add x24, x24, #0x10\n" "fmla v20.4s, v6.4s, v3.s[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "ldr q6, [x15, #0x20]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x23, x23, #0x10\n" "fmla v9.4s, v7.4s, v0.s[0]\n" - "add x26, x26, #0x10\n" - "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x10, #0x20]\n" + "add x22, x22, #0x10\n" "fmla v13.4s, v7.4s, v1.s[0]\n" - "add x24, x24, #0x10\n" + "prfm pldl1keep, [x23, #0x80]\n" "fmla v17.4s, v7.4s, v2.s[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" "fmla v21.4s, v7.4s, v3.s[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" "fmla v10.4s, v6.4s, v0.s[0]\n" "fmla v14.4s, v6.4s, v1.s[0]\n" "fmla v18.4s, v6.4s, v2.s[0]\n" "fmla v22.4s, v6.4s, v3.s[0]\n" - "ldr q6, [x15, #0x40]\n" + "ldr q6, [x10, #0x40]\n" "fmla v11.4s, v7.4s, v0.s[0]\n" "fmla v15.4s, v7.4s, v1.s[0]\n" "fmla v19.4s, v7.4s, v2.s[0]\n" "fmla v23.4s, v7.4s, v3.s[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" "fmla v8.4s, v6.4s, v0.s[1]\n" "fmla v12.4s, v6.4s, v1.s[1]\n" "fmla v16.4s, v6.4s, v2.s[1]\n" "fmla v20.4s, v6.4s, v3.s[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" "fmla v9.4s, v7.4s, v0.s[1]\n" "fmla v13.4s, v7.4s, v1.s[1]\n" "fmla v17.4s, v7.4s, v2.s[1]\n" "fmla v21.4s, v7.4s, v3.s[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" "fmla v10.4s, 
v6.4s, v0.s[1]\n" "fmla v14.4s, v6.4s, v1.s[1]\n" "fmla v18.4s, v6.4s, v2.s[1]\n" "fmla v22.4s, v6.4s, v3.s[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" "fmla v11.4s, v7.4s, v0.s[1]\n" "fmla v15.4s, v7.4s, v1.s[1]\n" "fmla v19.4s, v7.4s, v2.s[1]\n" "fmla v23.4s, v7.4s, v3.s[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" "fmla v8.4s, v6.4s, v0.s[2]\n" "fmla v12.4s, v6.4s, v1.s[2]\n" "fmla v16.4s, v6.4s, v2.s[2]\n" "fmla v20.4s, v6.4s, v3.s[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" "fmla v9.4s, v7.4s, v0.s[2]\n" "fmla v13.4s, v7.4s, v1.s[2]\n" "fmla v17.4s, v7.4s, v2.s[2]\n" "fmla v21.4s, v7.4s, v3.s[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" "fmla v10.4s, v6.4s, v0.s[2]\n" "fmla v14.4s, v6.4s, v1.s[2]\n" "fmla v18.4s, v6.4s, v2.s[2]\n" "fmla v22.4s, v6.4s, v3.s[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" "fmla v11.4s, v7.4s, v0.s[2]\n" "fmla v15.4s, v7.4s, v1.s[2]\n" "fmla v19.4s, v7.4s, v2.s[2]\n" "fmla v23.4s, v7.4s, v3.s[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" "fmla v8.4s, v6.4s, v0.s[3]\n" "fmla v12.4s, v6.4s, v1.s[3]\n" "fmla v16.4s, v6.4s, v2.s[3]\n" "fmla v20.4s, v6.4s, v3.s[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" "fmla v9.4s, v7.4s, v0.s[3]\n" "fmla v13.4s, v7.4s, v1.s[3]\n" "fmla v17.4s, v7.4s, v2.s[3]\n" "fmla v21.4s, v7.4s, v3.s[3]\n" - "ldr q7, [x15, #0xf0]\n" - "add x15, x15, #0x100\n" + "ldr q7, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" "fmla v10.4s, v6.4s, v0.s[3]\n" "fmla v14.4s, v6.4s, v1.s[3]\n" "fmla v18.4s, v6.4s, v2.s[3]\n" @@ -1675,28 +1638,28 @@ void a64_hybrid_fp32_mla_6x16 ( "fmla v15.4s, v7.4s, v1.s[3]\n" "fmla v19.4s, v7.4s, v2.s[3]\n" "fmla v23.4s, v7.4s, v3.s[3]\n" - "123:" // Height 4: Multiply loop: Main loop skip - "cbz x11, 125f\n" - "124:" // Height 4: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x28], #0x4\n" - "ldr s2, [x26], #0x4\n" - "ldr s3, [x24], #0x4\n" - "ldr q6, [x15, #0x0]\n" + 
"119:" // Height 4: Multiply loop: Main loop skip + "cbz x26, 121f\n" + "120:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x1\n" + "ldr s1, [x24], #0x4\n" + "ldr s2, [x23], #0x4\n" + "ldr s3, [x22], #0x4\n" + "ldr q6, [x10, #0x0]\n" "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "sub x11, x11, #0x1\n" "fmla v16.4s, v6.4s, v2.s[0]\n" "fmla v20.4s, v6.4s, v3.s[0]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q6, [x10, #0x20]\n" "fmla v9.4s, v7.4s, v0.s[0]\n" "fmla v13.4s, v7.4s, v1.s[0]\n" "fmla v17.4s, v7.4s, v2.s[0]\n" "fmla v21.4s, v7.4s, v3.s[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" "fmla v10.4s, v6.4s, v0.s[0]\n" - "add x15, x15, #0x40\n" "fmla v14.4s, v6.4s, v1.s[0]\n" "fmla v18.4s, v6.4s, v2.s[0]\n" "fmla v22.4s, v6.4s, v3.s[0]\n" @@ -1704,17 +1667,21 @@ void a64_hybrid_fp32_mla_6x16 ( "fmla v15.4s, v7.4s, v1.s[0]\n" "fmla v19.4s, v7.4s, v2.s[0]\n" "fmla v23.4s, v7.4s, v3.s[0]\n" - "cbnz x11, 124b\n" - "125:" // Height 4: Multiply loop: No odd multiplies + "cbnz x26, 120b\n" + "121:" // Height 4: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 118b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "prfm pstl1keep, [x27, #0x0]\n" - "prfm pstl1keep, [x25, #0x0]\n" - "tbz %x[flags], #1, 126f\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 114b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x24, x28, x19, LSL #2\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "tbz %x[flags], #1, 122f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1r { v1.4s }, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -1751,162 +1718,141 @@ void 
a64_hybrid_fp32_mla_6x16 ( "fmax v21.4s, v21.4s, v1.4s\n" "fmax v22.4s, v22.4s, v1.4s\n" "fmax v23.4s, v23.4s, v1.4s\n" - "126:" // Height 4: No activation - "cmp x16, #0x10\n" - "bge 135f\n" - "tbz x16, #3, 130f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v9.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v13.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "st1 { v17.4s }, [x27], #0x10\n" - "st1 { v20.4s }, [x25], #0x10\n" - "st1 { v21.4s }, [x25], #0x10\n" - "tbz x16, #2, 128f\n" - "st1 { v10.4s }, [x13], #0x10\n" - "st1 { v14.4s }, [x9], #0x10\n" - "st1 { v18.4s }, [x27], #0x10\n" - "st1 { v22.4s }, [x25], #0x10\n" - "tbz x16, #1, 127f\n" - "str d11, [x13], #0x8\n" - "str d15, [x9], #0x8\n" - "str d19, [x27], #0x8\n" - "str d23, [x25], #0x8\n" - "tbz x16, #0, 134f\n" - "st1 { v11.s }[2], [x13]\n" - "st1 { v15.s }[2], [x9]\n" - "st1 { v19.s }[2], [x27]\n" - "st1 { v23.s }[2], [x25]\n" - "b 134f\n" - "127:" // Height 4: Partial direct writeback: partial_1_12 - "tbz x16, #0, 134f\n" - "str s11, [x13, #0x0]\n" - "str s15, [x9, #0x0]\n" - "str s19, [x27, #0x0]\n" - "str s23, [x25, #0x0]\n" - "b 134f\n" - "128:" // Height 4: Partial direct writeback: partial_2_8 - "tbz x16, #1, 129f\n" - "str d10, [x13], #0x8\n" - "str d14, [x9], #0x8\n" - "str d18, [x27], #0x8\n" - "str d22, [x25], #0x8\n" - "tbz x16, #0, 134f\n" - "st1 { v10.s }[2], [x13]\n" - "st1 { v14.s }[2], [x9]\n" - "st1 { v18.s }[2], [x27]\n" - "st1 { v22.s }[2], [x25]\n" - "b 134f\n" - "129:" // Height 4: Partial direct writeback: partial_1_8 - "tbz x16, #0, 134f\n" - "str s10, [x13, #0x0]\n" - "str s14, [x9, #0x0]\n" - "str s18, [x27, #0x0]\n" - "str s22, [x25, #0x0]\n" - "b 134f\n" - "130:" // Height 4: Partial direct writeback: partial_4_0 - "tbz x16, #2, 132f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "st1 { v20.4s }, [x25], #0x10\n" - "tbz x16, #1, 131f\n" - "str d9, [x13], #0x8\n" - "str d13, [x9], #0x8\n" - 
"str d17, [x27], #0x8\n" - "str d21, [x25], #0x8\n" - "tbz x16, #0, 134f\n" - "st1 { v9.s }[2], [x13]\n" - "st1 { v13.s }[2], [x9]\n" - "st1 { v17.s }[2], [x27]\n" - "st1 { v21.s }[2], [x25]\n" - "b 134f\n" - "131:" // Height 4: Partial direct writeback: partial_1_4 - "tbz x16, #0, 134f\n" - "str s9, [x13, #0x0]\n" - "str s13, [x9, #0x0]\n" - "str s17, [x27, #0x0]\n" - "str s21, [x25, #0x0]\n" - "b 134f\n" - "132:" // Height 4: Partial direct writeback: partial_2_0 - "tbz x16, #1, 133f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "str d16, [x27], #0x8\n" - "str d20, [x25], #0x8\n" - "tbz x16, #0, 134f\n" - "st1 { v8.s }[2], [x13]\n" - "st1 { v12.s }[2], [x9]\n" - "st1 { v16.s }[2], [x27]\n" - "st1 { v20.s }[2], [x25]\n" - "b 134f\n" - "133:" // Height 4: Partial direct writeback: partial_1_0 - "str s8, [x13, #0x0]\n" - "str s12, [x9, #0x0]\n" - "str s16, [x27, #0x0]\n" - "str s20, [x25, #0x0]\n" - "134:" // Height 4: Partial direct writeback: Done - "b 136f\n" - "135:" // Height 4: Full writeback - "str q8, [x13, #0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "str q12, [x9, #0x0]\n" - "str q13, [x9, #0x10]\n" - "str q14, [x9, #0x20]\n" - "str q15, [x9, #0x30]\n" - "str q16, [x27, #0x0]\n" - "str q17, [x27, #0x10]\n" - "str q18, [x27, #0x20]\n" - "str q19, [x27, #0x30]\n" - "str q20, [x25, #0x0]\n" - "str q21, [x25, #0x10]\n" - "str q22, [x25, #0x20]\n" - "str q23, [x25, #0x30]\n" - "add x13, x13, #0x40\n" - "add x9, x9, #0x40\n" - "add x27, x27, #0x40\n" - "add x25, x25, #0x40\n" - "136:" // Height 4: Writeback done - "subs x16, x16, #0x10\n" - "bgt 105b\n" - "b 206f\n" - "137:" // Height 5 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 138f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr 
x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "ldr x23, [%x[output_ptr], #0x20]\n" - "add x27, x27, x19, LSL #2\n" - "add x25, x25, x19, LSL #2\n" - "add x23, x23, x19, LSL #2\n" - "b 139f\n" - "138:" // Height 5: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "add x23, x25, x19, LSL #2\n" - "139:" // Height 5: Column loop - "cbz x14, 140f\n" - "ldr q8, [x14, #0x0]\n" + "122:" // Height 4: No activation + "cmp x11, #0x10\n" + "bge 131f\n" + "tbz x11, #3, 126f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v9.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x24], #0x10\n" + "st1 { v13.4s }, [x24], #0x10\n" + "st1 { v16.4s }, [x23], #0x10\n" + "st1 { v17.4s }, [x23], #0x10\n" + "st1 { v20.4s }, [x22], #0x10\n" + "st1 { v21.4s }, [x22], #0x10\n" + "tbz x11, #2, 124f\n" + "st1 { v10.4s }, [x28], #0x10\n" + "st1 { v14.4s }, [x24], #0x10\n" + "st1 { v18.4s }, [x23], #0x10\n" + "st1 { v22.4s }, [x22], #0x10\n" + "tbz x11, #1, 123f\n" + "str d11, [x28], #0x8\n" + "str d15, [x24], #0x8\n" + "str d19, [x23], #0x8\n" + "str d23, [x22], #0x8\n" + "tbz x11, #0, 130f\n" + "st1 { v11.s }[2], [x28]\n" + "st1 { v15.s }[2], [x24]\n" + "st1 { v19.s }[2], [x23]\n" + "st1 { v23.s }[2], [x22]\n" + "b 130f\n" + "123:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x11, #0, 130f\n" + "str s11, [x28, #0x0]\n" + "str s15, [x24, #0x0]\n" + "str s19, [x23, #0x0]\n" + "str s23, [x22, #0x0]\n" + "b 130f\n" + "124:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x11, #1, 125f\n" + "str d10, [x28], #0x8\n" + "str d14, [x24], #0x8\n" + "str d18, [x23], #0x8\n" + "str d22, [x22], #0x8\n" + "tbz x11, #0, 130f\n" + "st1 { v10.s }[2], [x28]\n" + "st1 { v14.s }[2], [x24]\n" + "st1 { v18.s }[2], [x23]\n" + "st1 { v22.s }[2], [x22]\n" + "b 130f\n" + "125:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x11, #0, 130f\n" + "str 
s10, [x28, #0x0]\n" + "str s14, [x24, #0x0]\n" + "str s18, [x23, #0x0]\n" + "str s22, [x22, #0x0]\n" + "b 130f\n" + "126:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x11, #2, 128f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x24], #0x10\n" + "st1 { v16.4s }, [x23], #0x10\n" + "st1 { v20.4s }, [x22], #0x10\n" + "tbz x11, #1, 127f\n" + "str d9, [x28], #0x8\n" + "str d13, [x24], #0x8\n" + "str d17, [x23], #0x8\n" + "str d21, [x22], #0x8\n" + "tbz x11, #0, 130f\n" + "st1 { v9.s }[2], [x28]\n" + "st1 { v13.s }[2], [x24]\n" + "st1 { v17.s }[2], [x23]\n" + "st1 { v21.s }[2], [x22]\n" + "b 130f\n" + "127:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x11, #0, 130f\n" + "str s9, [x28, #0x0]\n" + "str s13, [x24, #0x0]\n" + "str s17, [x23, #0x0]\n" + "str s21, [x22, #0x0]\n" + "b 130f\n" + "128:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x11, #1, 129f\n" + "str d8, [x28], #0x8\n" + "str d12, [x24], #0x8\n" + "str d16, [x23], #0x8\n" + "str d20, [x22], #0x8\n" + "tbz x11, #0, 130f\n" + "st1 { v8.s }[2], [x28]\n" + "st1 { v12.s }[2], [x24]\n" + "st1 { v16.s }[2], [x23]\n" + "st1 { v20.s }[2], [x22]\n" + "b 130f\n" + "129:" // Height 4: Partial direct writeback: partial_1_0 + "str s8, [x28, #0x0]\n" + "str s12, [x24, #0x0]\n" + "str s16, [x23, #0x0]\n" + "str s20, [x22, #0x0]\n" + "130:" // Height 4: Partial direct writeback: Done + "b 132f\n" + "131:" // Height 4: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q12, [x24, #0x0]\n" + "str q13, [x24, #0x10]\n" + "str q14, [x24, #0x20]\n" + "str q15, [x24, #0x30]\n" + "str q16, [x23, #0x0]\n" + "str q17, [x23, #0x10]\n" + "str q18, [x23, #0x20]\n" + "str q19, [x23, #0x30]\n" + "str q20, [x22, #0x0]\n" + "str q21, [x22, #0x10]\n" + "str q22, [x22, #0x20]\n" + "str q23, [x22, #0x30]\n" + "132:" // Height 4: Writeback done + "subs x11, x11, #0x10\n" + "bgt 101b\n" + "b 
200f\n" + "133:" // Height 5 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "134:" // Height 5: Column loop + "cbz x9, 135f\n" + "ldr q8, [x9, #0x0]\n" "mov v12.16b, v8.16b\n" - "ldr q9, [x14, #0x10]\n" + "ldr q9, [x9, #0x10]\n" "mov v16.16b, v8.16b\n" - "ldr q10, [x14, #0x20]\n" + "ldr q10, [x9, #0x20]\n" "mov v20.16b, v8.16b\n" - "ldr q11, [x14, #0x30]\n" + "ldr q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" "mov v24.16b, v8.16b\n" - "add x14, x14, #0x40\n" "mov v13.16b, v9.16b\n" "mov v17.16b, v9.16b\n" "mov v14.16b, v10.16b\n" @@ -1919,157 +1865,158 @@ void a64_hybrid_fp32_mla_6x16 ( "mov v25.16b, v9.16b\n" "mov v26.16b, v10.16b\n" "mov v27.16b, v11.16b\n" - "b 151f\n" - "140:" // Height 5: no bias - "tbz %x[flags], #0, 150f\n" - "cmp x16, #0x10\n" - "bge 149f\n" - "tbz x16, #3, 144f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "ld1 { v20.4s }, [x25], #0x10\n" - "ld1 { v24.4s }, [x23], #0x10\n" - "ld1 { v9.4s }, [x13], #0x10\n" - "ld1 { v13.4s }, [x9], #0x10\n" - "ld1 { v17.4s }, [x27], #0x10\n" - "ld1 { v21.4s }, [x25], #0x10\n" - "ld1 { v25.4s }, [x23], #0x10\n" - "tbz x16, #2, 142f\n" - "ld1 { v10.4s }, [x13], #0x10\n" - "ld1 { v14.4s }, [x9], #0x10\n" - "ld1 { v18.4s }, [x27], #0x10\n" - "ld1 { v22.4s }, [x25], #0x10\n" - "ld1 { v26.4s }, [x23], #0x10\n" - "tbz x16, #1, 141f\n" + "b 146f\n" + "135:" // Height 5: no bias + "tbz %x[flags], #0, 145f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "bge 144f\n" + "tbz x11, #3, 139f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v16.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "ld1 { v24.4s }, [x21], #0x10\n" + "ld1 { v9.4s }, [x28], #0x10\n" + 
"ld1 { v13.4s }, [x24], #0x10\n" + "ld1 { v17.4s }, [x23], #0x10\n" + "ld1 { v21.4s }, [x22], #0x10\n" + "ld1 { v25.4s }, [x21], #0x10\n" + "tbz x11, #2, 137f\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x24], #0x10\n" + "ld1 { v18.4s }, [x23], #0x10\n" + "ld1 { v22.4s }, [x22], #0x10\n" + "ld1 { v26.4s }, [x21], #0x10\n" + "tbz x11, #1, 136f\n" "mov x19, #0x38\n" - "ldr d11, [x13], #0x8\n" - "ldr d15, [x9], #0x8\n" - "ldr d19, [x27], #0x8\n" - "ldr d23, [x25], #0x8\n" - "ldr d27, [x23], #0x8\n" - "tbz x16, #0, 148f\n" - "ld1 { v11.s }[2], [x13]\n" - "ld1 { v15.s }[2], [x9]\n" - "ld1 { v19.s }[2], [x27]\n" - "ld1 { v23.s }[2], [x25]\n" - "ld1 { v27.s }[2], [x23]\n" - "b 148f\n" - "141:" // Height 5: Partial accumulate: partial_1_12 + "ldr d11, [x28], #0x8\n" + "ldr d15, [x24], #0x8\n" + "ldr d19, [x23], #0x8\n" + "ldr d23, [x22], #0x8\n" + "ldr d27, [x21], #0x8\n" + "tbz x11, #0, 143f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x24]\n" + "ld1 { v19.s }[2], [x23]\n" + "ld1 { v23.s }[2], [x22]\n" + "ld1 { v27.s }[2], [x21]\n" + "b 143f\n" + "136:" // Height 5: Partial accumulate: partial_1_12 "mov x19, #0x30\n" - "tbz x16, #0, 148f\n" - "ldr s11, [x13, #0x0]\n" - "ldr s15, [x9, #0x0]\n" - "ldr s19, [x27, #0x0]\n" - "ldr s23, [x25, #0x0]\n" - "ldr s27, [x23, #0x0]\n" - "b 148f\n" - "142:" // Height 5: Partial accumulate: partial_2_8 - "tbz x16, #1, 143f\n" - "ldr d10, [x13], #0x8\n" - "ldr d14, [x9], #0x8\n" - "ldr d18, [x27], #0x8\n" - "ldr d22, [x25], #0x8\n" - "ldr d26, [x23], #0x8\n" + "tbz x11, #0, 143f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s15, [x24, #0x0]\n" + "ldr s19, [x23, #0x0]\n" + "ldr s23, [x22, #0x0]\n" + "ldr s27, [x21, #0x0]\n" + "b 143f\n" + "137:" // Height 5: Partial accumulate: partial_2_8 + "tbz x11, #1, 138f\n" + "ldr d10, [x28], #0x8\n" + "ldr d14, [x24], #0x8\n" "mov x19, #0x28\n" - "tbz x16, #0, 148f\n" - "ld1 { v10.s }[2], [x13]\n" - "ld1 { v14.s }[2], [x9]\n" - "ld1 { v18.s }[2], [x27]\n" - "ld1 { v22.s }[2], [x25]\n" 
- "ld1 { v26.s }[2], [x23]\n" - "b 148f\n" - "143:" // Height 5: Partial accumulate: partial_1_8 + "ldr d18, [x23], #0x8\n" + "ldr d22, [x22], #0x8\n" + "ldr d26, [x21], #0x8\n" + "tbz x11, #0, 143f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x24]\n" + "ld1 { v18.s }[2], [x23]\n" + "ld1 { v22.s }[2], [x22]\n" + "ld1 { v26.s }[2], [x21]\n" + "b 143f\n" + "138:" // Height 5: Partial accumulate: partial_1_8 "mov x19, #0x20\n" - "tbz x16, #0, 148f\n" - "ldr s10, [x13, #0x0]\n" - "ldr s14, [x9, #0x0]\n" - "ldr s18, [x27, #0x0]\n" - "ldr s22, [x25, #0x0]\n" - "ldr s26, [x23, #0x0]\n" - "b 148f\n" - "144:" // Height 5: Partial accumulate: partial_4_0 - "tbz x16, #2, 146f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "ld1 { v20.4s }, [x25], #0x10\n" - "ld1 { v24.4s }, [x23], #0x10\n" - "tbz x16, #1, 145f\n" + "tbz x11, #0, 143f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s14, [x24, #0x0]\n" + "ldr s18, [x23, #0x0]\n" + "ldr s22, [x22, #0x0]\n" + "ldr s26, [x21, #0x0]\n" + "b 143f\n" + "139:" // Height 5: Partial accumulate: partial_4_0 + "tbz x11, #2, 141f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v16.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "ld1 { v24.4s }, [x21], #0x10\n" + "tbz x11, #1, 140f\n" "mov x19, #0x18\n" - "ldr d9, [x13], #0x8\n" - "ldr d13, [x9], #0x8\n" - "ldr d17, [x27], #0x8\n" - "ldr d21, [x25], #0x8\n" - "ldr d25, [x23], #0x8\n" - "tbz x16, #0, 148f\n" - "ld1 { v9.s }[2], [x13]\n" - "ld1 { v13.s }[2], [x9]\n" - "ld1 { v17.s }[2], [x27]\n" - "ld1 { v21.s }[2], [x25]\n" - "ld1 { v25.s }[2], [x23]\n" - "b 148f\n" - "145:" // Height 5: Partial accumulate: partial_1_4 + "ldr d9, [x28], #0x8\n" + "ldr d13, [x24], #0x8\n" + "ldr d17, [x23], #0x8\n" + "ldr d21, [x22], #0x8\n" + "ldr d25, [x21], #0x8\n" + "tbz x11, #0, 143f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x24]\n" + "ld1 { v17.s }[2], [x23]\n" + "ld1 { v21.s }[2], [x22]\n" 
+ "ld1 { v25.s }[2], [x21]\n" + "b 143f\n" + "140:" // Height 5: Partial accumulate: partial_1_4 "mov x19, #0x10\n" - "tbz x16, #0, 148f\n" - "ldr s9, [x13, #0x0]\n" - "ldr s13, [x9, #0x0]\n" - "ldr s17, [x27, #0x0]\n" - "ldr s21, [x25, #0x0]\n" - "ldr s25, [x23, #0x0]\n" - "b 148f\n" - "146:" // Height 5: Partial accumulate: partial_2_0 - "tbz x16, #1, 147f\n" - "ldr d8, [x13], #0x8\n" - "ldr d12, [x9], #0x8\n" - "ldr d16, [x27], #0x8\n" - "ldr d20, [x25], #0x8\n" - "ldr d24, [x23], #0x8\n" + "tbz x11, #0, 143f\n" + "ldr s9, [x28, #0x0]\n" + "ldr s13, [x24, #0x0]\n" + "ldr s17, [x23, #0x0]\n" + "ldr s21, [x22, #0x0]\n" + "ldr s25, [x21, #0x0]\n" + "b 143f\n" + "141:" // Height 5: Partial accumulate: partial_2_0 + "tbz x11, #1, 142f\n" + "ldr d8, [x28], #0x8\n" + "ldr d12, [x24], #0x8\n" "mov x19, #0x8\n" - "tbz x16, #0, 148f\n" - "ld1 { v8.s }[2], [x13]\n" - "ld1 { v12.s }[2], [x9]\n" - "ld1 { v16.s }[2], [x27]\n" - "ld1 { v20.s }[2], [x25]\n" - "ld1 { v24.s }[2], [x23]\n" - "b 148f\n" - "147:" // Height 5: Partial accumulate: partial_1_0 + "ldr d16, [x23], #0x8\n" + "ldr d20, [x22], #0x8\n" + "ldr d24, [x21], #0x8\n" + "tbz x11, #0, 143f\n" + "ld1 { v8.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x24]\n" + "ld1 { v16.s }[2], [x23]\n" + "ld1 { v20.s }[2], [x22]\n" + "ld1 { v24.s }[2], [x21]\n" + "b 143f\n" + "142:" // Height 5: Partial accumulate: partial_1_0 + "ldr s8, [x28, #0x0]\n" "mov x19, #0x0\n" - "ldr s8, [x13, #0x0]\n" - "ldr s12, [x9, #0x0]\n" - "ldr s16, [x27, #0x0]\n" - "ldr s20, [x25, #0x0]\n" - "ldr s24, [x23, #0x0]\n" - "148:" // Height 5: Partial accumulate: Done - "sub x13, x13, x19\n" - "sub x9, x9, x19\n" - "sub x27, x27, x19\n" - "sub x25, x25, x19\n" - "sub x23, x23, x19\n" - "b 151f\n" - "149:" // Height 5: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "ldr q12, [x9, #0x0]\n" - "ldr q13, [x9, #0x10]\n" - "ldr q14, [x9, #0x20]\n" - "ldr q15, [x9, #0x30]\n" - "ldr q16, 
[x27, #0x0]\n" - "ldr q17, [x27, #0x10]\n" - "ldr q18, [x27, #0x20]\n" - "ldr q19, [x27, #0x30]\n" - "ldr q20, [x25, #0x0]\n" - "ldr q21, [x25, #0x10]\n" - "ldr q22, [x25, #0x20]\n" - "ldr q23, [x25, #0x30]\n" - "ldr q24, [x23, #0x0]\n" - "ldr q25, [x23, #0x10]\n" - "ldr q26, [x23, #0x20]\n" - "ldr q27, [x23, #0x30]\n" - "b 151f\n" - "150:" // Height 5: no accumulate + "ldr s12, [x24, #0x0]\n" + "ldr s16, [x23, #0x0]\n" + "ldr s20, [x22, #0x0]\n" + "ldr s24, [x21, #0x0]\n" + "143:" // Height 5: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 146f\n" + "144:" // Height 5: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "ldr q12, [x24, #0x0]\n" + "ldr q13, [x24, #0x10]\n" + "ldr q14, [x24, #0x20]\n" + "ldr q15, [x24, #0x30]\n" + "ldr q16, [x23, #0x0]\n" + "ldr q17, [x23, #0x10]\n" + "ldr q18, [x23, #0x20]\n" + "ldr q19, [x23, #0x30]\n" + "ldr q20, [x22, #0x0]\n" + "ldr q21, [x22, #0x10]\n" + "ldr q22, [x22, #0x20]\n" + "ldr q23, [x22, #0x30]\n" + "ldr q24, [x21, #0x0]\n" + "ldr q25, [x21, #0x10]\n" + "ldr q26, [x21, #0x20]\n" + "ldr q27, [x21, #0x30]\n" + "b 146f\n" + "145:" // Height 5: no accumulate "movi v8.16b, #0x0\n" "movi v9.16b, #0x0\n" "movi v10.16b, #0x0\n" @@ -2090,260 +2037,260 @@ void a64_hybrid_fp32_mla_6x16 ( "movi v25.16b, #0x0\n" "movi v26.16b, #0x0\n" "movi v27.16b, #0x0\n" - "151:" // Height 5: setup done - "mov x12, #0x0\n" - "152:" // Height 5: String loop + "146:" // Height 5: setup done + "mov x27, #0x0\n" + "147:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 153f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 148f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, 
[x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "ldr x22, [x20, #0x20]\n" - "cbnz x12, 154f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "cbnz x27, 149f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #2\n" - "add x28, x28, x19, LSL #2\n" - "add x26, x26, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" "add x22, x22, x19, LSL #2\n" - "b 154f\n" - "153:" // Height 5: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #2\n" - "add x26, x28, x19, LSL #2\n" - "add x24, x26, x19, LSL #2\n" - "add x22, x24, x19, LSL #2\n" - "154:" // Height 5: input setup done - "cmp x11, #0x4\n" - "blt 157f\n" - "cmp x11, #0x8\n" - "blt 156f\n" - "155:" // Height 5: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q4, [x22, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "add x21, x21, x19, LSL #2\n" + "b 149f\n" + "148:" // Height 5: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "149:" // Height 5: input setup done + "cmp x26, #0x4\n" + "blt 152f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x8\n" + "ldr q2, [x23, #0x0]\n" + "ldr q3, [x22, #0x0]\n" + "ldr q4, [x21, #0x0]\n" + "ldr q6, [x10, #0x0]\n" + "blt 151f\n" + "150:" // Height 5: Multiply loop: Main loop head "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" + "add x25, x25, #0x10\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" "fmla v16.4s, v6.4s, v2.s[0]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x23, x23, 
#0x10\n" "fmla v20.4s, v6.4s, v3.s[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x23, #0x80]\n" + "add x22, x22, #0x10\n" "fmla v24.4s, v6.4s, v4.s[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q6, [x10, #0x20]\n" + "add x21, x21, #0x10\n" "fmla v9.4s, v7.4s, v0.s[0]\n" - "add x24, x24, #0x10\n" - "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sub x26, x26, #0x4\n" "fmla v13.4s, v7.4s, v1.s[0]\n" - "add x22, x22, #0x10\n" + "prfm pldl1keep, [x21, #0x80]\n" + "cmp x26, #0x8\n" "fmla v17.4s, v7.4s, v2.s[0]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "sub x11, x11, #0x4\n" "fmla v21.4s, v7.4s, v3.s[0]\n" - "cmp x11, #0x8\n" "fmla v25.4s, v7.4s, v4.s[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" "fmla v10.4s, v6.4s, v0.s[0]\n" "fmla v14.4s, v6.4s, v1.s[0]\n" "fmla v18.4s, v6.4s, v2.s[0]\n" "fmla v22.4s, v6.4s, v3.s[0]\n" "fmla v26.4s, v6.4s, v4.s[0]\n" - "ldr q6, [x15, #0x40]\n" + "ldr q6, [x10, #0x40]\n" "fmla v11.4s, v7.4s, v0.s[0]\n" "fmla v15.4s, v7.4s, v1.s[0]\n" "fmla v19.4s, v7.4s, v2.s[0]\n" "fmla v23.4s, v7.4s, v3.s[0]\n" "fmla v27.4s, v7.4s, v4.s[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" "fmla v8.4s, v6.4s, v0.s[1]\n" "fmla v12.4s, v6.4s, v1.s[1]\n" "fmla v16.4s, v6.4s, v2.s[1]\n" "fmla v20.4s, v6.4s, v3.s[1]\n" "fmla v24.4s, v6.4s, v4.s[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" "fmla v9.4s, v7.4s, v0.s[1]\n" "fmla v13.4s, v7.4s, v1.s[1]\n" "fmla v17.4s, v7.4s, v2.s[1]\n" "fmla v21.4s, v7.4s, v3.s[1]\n" "fmla v25.4s, v7.4s, v4.s[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" "fmla v10.4s, v6.4s, v0.s[1]\n" "fmla v14.4s, v6.4s, v1.s[1]\n" "fmla v18.4s, v6.4s, v2.s[1]\n" "fmla v22.4s, v6.4s, v3.s[1]\n" "fmla v26.4s, v6.4s, v4.s[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" "fmla v11.4s, v7.4s, v0.s[1]\n" "fmla v15.4s, v7.4s, v1.s[1]\n" "fmla v19.4s, v7.4s, v2.s[1]\n" "fmla v23.4s, v7.4s, 
v3.s[1]\n" "fmla v27.4s, v7.4s, v4.s[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" "fmla v8.4s, v6.4s, v0.s[2]\n" "fmla v12.4s, v6.4s, v1.s[2]\n" "fmla v16.4s, v6.4s, v2.s[2]\n" "fmla v20.4s, v6.4s, v3.s[2]\n" "fmla v24.4s, v6.4s, v4.s[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" "fmla v9.4s, v7.4s, v0.s[2]\n" "fmla v13.4s, v7.4s, v1.s[2]\n" "fmla v17.4s, v7.4s, v2.s[2]\n" "fmla v21.4s, v7.4s, v3.s[2]\n" "fmla v25.4s, v7.4s, v4.s[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" "fmla v10.4s, v6.4s, v0.s[2]\n" "fmla v14.4s, v6.4s, v1.s[2]\n" "fmla v18.4s, v6.4s, v2.s[2]\n" "fmla v22.4s, v6.4s, v3.s[2]\n" "fmla v26.4s, v6.4s, v4.s[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" "fmla v11.4s, v7.4s, v0.s[2]\n" "fmla v15.4s, v7.4s, v1.s[2]\n" "fmla v19.4s, v7.4s, v2.s[2]\n" "fmla v23.4s, v7.4s, v3.s[2]\n" "fmla v27.4s, v7.4s, v4.s[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" "fmla v8.4s, v6.4s, v0.s[3]\n" "fmla v12.4s, v6.4s, v1.s[3]\n" "fmla v16.4s, v6.4s, v2.s[3]\n" "fmla v20.4s, v6.4s, v3.s[3]\n" "fmla v24.4s, v6.4s, v4.s[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" "fmla v9.4s, v7.4s, v0.s[3]\n" "fmla v13.4s, v7.4s, v1.s[3]\n" "fmla v17.4s, v7.4s, v2.s[3]\n" "fmla v21.4s, v7.4s, v3.s[3]\n" "fmla v25.4s, v7.4s, v4.s[3]\n" - "ldr q7, [x15, #0xf0]\n" + "ldr q7, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" "fmla v10.4s, v6.4s, v0.s[3]\n" - "add x15, x15, #0x100\n" "fmla v14.4s, v6.4s, v1.s[3]\n" "fmla v18.4s, v6.4s, v2.s[3]\n" "fmla v22.4s, v6.4s, v3.s[3]\n" "fmla v26.4s, v6.4s, v4.s[3]\n" + "ldr q6, [x10, #0x0]\n" "fmla v11.4s, v7.4s, v0.s[3]\n" + "ldr q0, [x25, #0x0]\n" "fmla v15.4s, v7.4s, v1.s[3]\n" + "ldr q1, [x24, #0x0]\n" "fmla v19.4s, v7.4s, v2.s[3]\n" + "ldr q2, [x23, #0x0]\n" "fmla v23.4s, v7.4s, v3.s[3]\n" + "ldr q3, [x22, #0x0]\n" "fmla v27.4s, v7.4s, v4.s[3]\n" - "bge 155b\n" - "156:" // Height 5: Multiply loop: Single iteration only - "sub x11, x11, #0x4\n" - "ldr q0, [x10, 
#0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q4, [x22, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "ldr q4, [x21, #0x0]\n" + "bge 150b\n" + "151:" // Height 5: Multiply loop: Single iteration only "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" + "sub x26, x26, #0x4\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "add x25, x25, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" "fmla v16.4s, v6.4s, v2.s[0]\n" - "add x28, x28, #0x10\n" + "add x24, x24, #0x10\n" "fmla v20.4s, v6.4s, v3.s[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x23, x23, #0x10\n" "fmla v24.4s, v6.4s, v4.s[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "ldr q6, [x15, #0x20]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "add x22, x22, #0x10\n" "fmla v9.4s, v7.4s, v0.s[0]\n" - "add x24, x24, #0x10\n" - "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x10, #0x20]\n" + "add x21, x21, #0x10\n" "fmla v13.4s, v7.4s, v1.s[0]\n" - "add x22, x22, #0x10\n" - "fmla v17.4s, v7.4s, v2.s[0]\n" "prfm pldl1keep, [x22, #0x80]\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "prfm pldl1keep, [x21, #0x80]\n" "fmla v21.4s, v7.4s, v3.s[0]\n" "fmla v25.4s, v7.4s, v4.s[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" "fmla v10.4s, v6.4s, v0.s[0]\n" "fmla v14.4s, v6.4s, v1.s[0]\n" "fmla v18.4s, v6.4s, v2.s[0]\n" "fmla v22.4s, v6.4s, v3.s[0]\n" "fmla v26.4s, v6.4s, v4.s[0]\n" - "ldr q6, [x15, #0x40]\n" + "ldr q6, [x10, #0x40]\n" "fmla v11.4s, v7.4s, v0.s[0]\n" "fmla v15.4s, v7.4s, v1.s[0]\n" "fmla v19.4s, v7.4s, v2.s[0]\n" "fmla v23.4s, v7.4s, v3.s[0]\n" "fmla v27.4s, v7.4s, v4.s[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" "fmla v8.4s, v6.4s, v0.s[1]\n" "fmla v12.4s, v6.4s, v1.s[1]\n" "fmla v16.4s, v6.4s, v2.s[1]\n" "fmla v20.4s, v6.4s, v3.s[1]\n" "fmla v24.4s, v6.4s, v4.s[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" "fmla 
v9.4s, v7.4s, v0.s[1]\n" "fmla v13.4s, v7.4s, v1.s[1]\n" "fmla v17.4s, v7.4s, v2.s[1]\n" "fmla v21.4s, v7.4s, v3.s[1]\n" "fmla v25.4s, v7.4s, v4.s[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" "fmla v10.4s, v6.4s, v0.s[1]\n" "fmla v14.4s, v6.4s, v1.s[1]\n" "fmla v18.4s, v6.4s, v2.s[1]\n" "fmla v22.4s, v6.4s, v3.s[1]\n" "fmla v26.4s, v6.4s, v4.s[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" "fmla v11.4s, v7.4s, v0.s[1]\n" "fmla v15.4s, v7.4s, v1.s[1]\n" "fmla v19.4s, v7.4s, v2.s[1]\n" "fmla v23.4s, v7.4s, v3.s[1]\n" "fmla v27.4s, v7.4s, v4.s[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" "fmla v8.4s, v6.4s, v0.s[2]\n" "fmla v12.4s, v6.4s, v1.s[2]\n" "fmla v16.4s, v6.4s, v2.s[2]\n" "fmla v20.4s, v6.4s, v3.s[2]\n" "fmla v24.4s, v6.4s, v4.s[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" "fmla v9.4s, v7.4s, v0.s[2]\n" "fmla v13.4s, v7.4s, v1.s[2]\n" "fmla v17.4s, v7.4s, v2.s[2]\n" "fmla v21.4s, v7.4s, v3.s[2]\n" "fmla v25.4s, v7.4s, v4.s[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" "fmla v10.4s, v6.4s, v0.s[2]\n" "fmla v14.4s, v6.4s, v1.s[2]\n" "fmla v18.4s, v6.4s, v2.s[2]\n" "fmla v22.4s, v6.4s, v3.s[2]\n" "fmla v26.4s, v6.4s, v4.s[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" "fmla v11.4s, v7.4s, v0.s[2]\n" "fmla v15.4s, v7.4s, v1.s[2]\n" "fmla v19.4s, v7.4s, v2.s[2]\n" "fmla v23.4s, v7.4s, v3.s[2]\n" "fmla v27.4s, v7.4s, v4.s[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" "fmla v8.4s, v6.4s, v0.s[3]\n" "fmla v12.4s, v6.4s, v1.s[3]\n" "fmla v16.4s, v6.4s, v2.s[3]\n" "fmla v20.4s, v6.4s, v3.s[3]\n" "fmla v24.4s, v6.4s, v4.s[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" "fmla v9.4s, v7.4s, v0.s[3]\n" "fmla v13.4s, v7.4s, v1.s[3]\n" "fmla v17.4s, v7.4s, v2.s[3]\n" "fmla v21.4s, v7.4s, v3.s[3]\n" "fmla v25.4s, v7.4s, v4.s[3]\n" - "ldr q7, [x15, #0xf0]\n" + "ldr q7, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" "fmla v10.4s, v6.4s, v0.s[3]\n" - "add x15, x15, 
#0x100\n" "fmla v14.4s, v6.4s, v1.s[3]\n" "fmla v18.4s, v6.4s, v2.s[3]\n" "fmla v22.4s, v6.4s, v3.s[3]\n" @@ -2353,31 +2300,31 @@ void a64_hybrid_fp32_mla_6x16 ( "fmla v19.4s, v7.4s, v2.s[3]\n" "fmla v23.4s, v7.4s, v3.s[3]\n" "fmla v27.4s, v7.4s, v4.s[3]\n" - "157:" // Height 5: Multiply loop: Main loop skip - "cbz x11, 159f\n" - "158:" // Height 5: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x28], #0x4\n" - "ldr s2, [x26], #0x4\n" - "ldr s3, [x24], #0x4\n" - "ldr s4, [x22], #0x4\n" - "ldr q6, [x15, #0x0]\n" + "152:" // Height 5: Multiply loop: Main loop skip + "cbz x26, 154f\n" + "153:" // Height 5: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x1\n" + "ldr s1, [x24], #0x4\n" + "ldr s2, [x23], #0x4\n" + "ldr s3, [x22], #0x4\n" + "ldr s4, [x21], #0x4\n" + "ldr q6, [x10, #0x0]\n" "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "sub x11, x11, #0x1\n" "fmla v16.4s, v6.4s, v2.s[0]\n" "fmla v20.4s, v6.4s, v3.s[0]\n" "fmla v24.4s, v6.4s, v4.s[0]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q6, [x10, #0x20]\n" "fmla v9.4s, v7.4s, v0.s[0]\n" "fmla v13.4s, v7.4s, v1.s[0]\n" "fmla v17.4s, v7.4s, v2.s[0]\n" "fmla v21.4s, v7.4s, v3.s[0]\n" "fmla v25.4s, v7.4s, v4.s[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" "fmla v10.4s, v6.4s, v0.s[0]\n" - "add x15, x15, #0x40\n" "fmla v14.4s, v6.4s, v1.s[0]\n" "fmla v18.4s, v6.4s, v2.s[0]\n" "fmla v22.4s, v6.4s, v3.s[0]\n" @@ -2387,18 +2334,23 @@ void a64_hybrid_fp32_mla_6x16 ( "fmla v19.4s, v7.4s, v2.s[0]\n" "fmla v23.4s, v7.4s, v3.s[0]\n" "fmla v27.4s, v7.4s, v4.s[0]\n" - "cbnz x11, 158b\n" - "159:" // Height 5: Multiply loop: No odd multiplies + "cbnz x26, 153b\n" + "154:" // Height 5: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 152b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, 
[x9, #0x0]\n" - "prfm pstl1keep, [x27, #0x0]\n" - "prfm pstl1keep, [x25, #0x0]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 147b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x24, x28, x19, LSL #2\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #2\n" "prfm pstl1keep, [x23, #0x0]\n" - "tbz %x[flags], #1, 160f\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbz %x[flags], #1, 155f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1r { v1.4s }, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -2443,193 +2395,169 @@ void a64_hybrid_fp32_mla_6x16 ( "fmax v26.4s, v26.4s, v1.4s\n" "fmin v27.4s, v27.4s, v0.4s\n" "fmax v27.4s, v27.4s, v1.4s\n" - "160:" // Height 5: No activation - "cmp x16, #0x10\n" - "bge 169f\n" - "tbz x16, #3, 164f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v9.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v13.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "st1 { v17.4s }, [x27], #0x10\n" - "st1 { v20.4s }, [x25], #0x10\n" - "st1 { v21.4s }, [x25], #0x10\n" - "st1 { v24.4s }, [x23], #0x10\n" - "st1 { v25.4s }, [x23], #0x10\n" - "tbz x16, #2, 162f\n" - "st1 { v10.4s }, [x13], #0x10\n" - "st1 { v14.4s }, [x9], #0x10\n" - "st1 { v18.4s }, [x27], #0x10\n" - "st1 { v22.4s }, [x25], #0x10\n" - "st1 { v26.4s }, [x23], #0x10\n" - "tbz x16, #1, 161f\n" - "str d11, [x13], #0x8\n" - "str d15, [x9], #0x8\n" - "str d19, [x27], #0x8\n" - "str d23, [x25], #0x8\n" - "str d27, [x23], #0x8\n" - "tbz x16, #0, 168f\n" - "st1 { v11.s }[2], [x13]\n" - "st1 { v15.s }[2], [x9]\n" - "st1 { v19.s }[2], [x27]\n" - "st1 { v23.s }[2], [x25]\n" - "st1 { v27.s }[2], [x23]\n" - "b 168f\n" - "161:" // Height 5: Partial direct writeback: partial_1_12 - "tbz x16, #0, 168f\n" - "str s11, [x13, #0x0]\n" - "str s15, [x9, #0x0]\n" - "str s19, [x27, #0x0]\n" - "str s23, [x25, #0x0]\n" - "str 
s27, [x23, #0x0]\n" - "b 168f\n" - "162:" // Height 5: Partial direct writeback: partial_2_8 - "tbz x16, #1, 163f\n" - "str d10, [x13], #0x8\n" - "str d14, [x9], #0x8\n" - "str d18, [x27], #0x8\n" - "str d22, [x25], #0x8\n" - "str d26, [x23], #0x8\n" - "tbz x16, #0, 168f\n" - "st1 { v10.s }[2], [x13]\n" - "st1 { v14.s }[2], [x9]\n" - "st1 { v18.s }[2], [x27]\n" - "st1 { v22.s }[2], [x25]\n" - "st1 { v26.s }[2], [x23]\n" - "b 168f\n" - "163:" // Height 5: Partial direct writeback: partial_1_8 - "tbz x16, #0, 168f\n" - "str s10, [x13, #0x0]\n" - "str s14, [x9, #0x0]\n" - "str s18, [x27, #0x0]\n" - "str s22, [x25, #0x0]\n" - "str s26, [x23, #0x0]\n" - "b 168f\n" - "164:" // Height 5: Partial direct writeback: partial_4_0 - "tbz x16, #2, 166f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "st1 { v20.4s }, [x25], #0x10\n" - "st1 { v24.4s }, [x23], #0x10\n" - "tbz x16, #1, 165f\n" - "str d9, [x13], #0x8\n" - "str d13, [x9], #0x8\n" - "str d17, [x27], #0x8\n" - "str d21, [x25], #0x8\n" - "str d25, [x23], #0x8\n" - "tbz x16, #0, 168f\n" - "st1 { v9.s }[2], [x13]\n" - "st1 { v13.s }[2], [x9]\n" - "st1 { v17.s }[2], [x27]\n" - "st1 { v21.s }[2], [x25]\n" - "st1 { v25.s }[2], [x23]\n" - "b 168f\n" - "165:" // Height 5: Partial direct writeback: partial_1_4 - "tbz x16, #0, 168f\n" - "str s9, [x13, #0x0]\n" - "str s13, [x9, #0x0]\n" - "str s17, [x27, #0x0]\n" - "str s21, [x25, #0x0]\n" - "str s25, [x23, #0x0]\n" - "b 168f\n" - "166:" // Height 5: Partial direct writeback: partial_2_0 - "tbz x16, #1, 167f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "str d16, [x27], #0x8\n" - "str d20, [x25], #0x8\n" - "str d24, [x23], #0x8\n" - "tbz x16, #0, 168f\n" - "st1 { v8.s }[2], [x13]\n" - "st1 { v12.s }[2], [x9]\n" - "st1 { v16.s }[2], [x27]\n" - "st1 { v20.s }[2], [x25]\n" - "st1 { v24.s }[2], [x23]\n" - "b 168f\n" - "167:" // Height 5: Partial direct writeback: partial_1_0 - "str s8, [x13, #0x0]\n" - "str s12, 
[x9, #0x0]\n" - "str s16, [x27, #0x0]\n" - "str s20, [x25, #0x0]\n" - "str s24, [x23, #0x0]\n" - "168:" // Height 5: Partial direct writeback: Done - "b 170f\n" - "169:" // Height 5: Full writeback - "str q8, [x13, #0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "str q12, [x9, #0x0]\n" - "str q13, [x9, #0x10]\n" - "str q14, [x9, #0x20]\n" - "str q15, [x9, #0x30]\n" - "str q16, [x27, #0x0]\n" - "str q17, [x27, #0x10]\n" - "str q18, [x27, #0x20]\n" - "str q19, [x27, #0x30]\n" - "str q20, [x25, #0x0]\n" - "str q21, [x25, #0x10]\n" - "str q22, [x25, #0x20]\n" - "str q23, [x25, #0x30]\n" - "str q24, [x23, #0x0]\n" - "str q25, [x23, #0x10]\n" - "str q26, [x23, #0x20]\n" - "str q27, [x23, #0x30]\n" - "add x13, x13, #0x40\n" - "add x9, x9, #0x40\n" - "add x27, x27, #0x40\n" - "add x25, x25, #0x40\n" - "add x23, x23, #0x40\n" - "170:" // Height 5: Writeback done - "subs x16, x16, #0x10\n" - "bgt 139b\n" - "b 206f\n" - "171:" // Height 6 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 172f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "ldr x23, [%x[output_ptr], #0x20]\n" - "add x27, x27, x19, LSL #2\n" - "ldr x21, [%x[output_ptr], #0x28]\n" - "add %x[output_ptr], %x[output_ptr], #0x30\n" - "add x25, x25, x19, LSL #2\n" - "add x23, x23, x19, LSL #2\n" - "add x21, x21, x19, LSL #2\n" - "b 173f\n" - "172:" // Height 6: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "add x23, x25, x19, LSL #2\n" - "add x21, x23, x19, LSL #2\n" - "add %x[output_ptr], x21, x19, LSL #2\n" - "173:" // Height 6: Column loop - "cbz x14, 
174f\n" - "ldr q8, [x14, #0x0]\n" + "155:" // Height 5: No activation + "cmp x11, #0x10\n" + "bge 164f\n" + "tbz x11, #3, 159f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v9.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x24], #0x10\n" + "st1 { v13.4s }, [x24], #0x10\n" + "st1 { v16.4s }, [x23], #0x10\n" + "st1 { v17.4s }, [x23], #0x10\n" + "st1 { v20.4s }, [x22], #0x10\n" + "st1 { v21.4s }, [x22], #0x10\n" + "st1 { v24.4s }, [x21], #0x10\n" + "st1 { v25.4s }, [x21], #0x10\n" + "tbz x11, #2, 157f\n" + "st1 { v10.4s }, [x28], #0x10\n" + "st1 { v14.4s }, [x24], #0x10\n" + "st1 { v18.4s }, [x23], #0x10\n" + "st1 { v22.4s }, [x22], #0x10\n" + "st1 { v26.4s }, [x21], #0x10\n" + "tbz x11, #1, 156f\n" + "str d11, [x28], #0x8\n" + "str d15, [x24], #0x8\n" + "str d19, [x23], #0x8\n" + "str d23, [x22], #0x8\n" + "str d27, [x21], #0x8\n" + "tbz x11, #0, 163f\n" + "st1 { v11.s }[2], [x28]\n" + "st1 { v15.s }[2], [x24]\n" + "st1 { v19.s }[2], [x23]\n" + "st1 { v23.s }[2], [x22]\n" + "st1 { v27.s }[2], [x21]\n" + "b 163f\n" + "156:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x11, #0, 163f\n" + "str s11, [x28, #0x0]\n" + "str s15, [x24, #0x0]\n" + "str s19, [x23, #0x0]\n" + "str s23, [x22, #0x0]\n" + "str s27, [x21, #0x0]\n" + "b 163f\n" + "157:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x11, #1, 158f\n" + "str d10, [x28], #0x8\n" + "str d14, [x24], #0x8\n" + "str d18, [x23], #0x8\n" + "str d22, [x22], #0x8\n" + "str d26, [x21], #0x8\n" + "tbz x11, #0, 163f\n" + "st1 { v10.s }[2], [x28]\n" + "st1 { v14.s }[2], [x24]\n" + "st1 { v18.s }[2], [x23]\n" + "st1 { v22.s }[2], [x22]\n" + "st1 { v26.s }[2], [x21]\n" + "b 163f\n" + "158:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x11, #0, 163f\n" + "str s10, [x28, #0x0]\n" + "str s14, [x24, #0x0]\n" + "str s18, [x23, #0x0]\n" + "str s22, [x22, #0x0]\n" + "str s26, [x21, #0x0]\n" + "b 163f\n" + "159:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x11, #2, 161f\n" + "st1 { v8.4s }, 
[x28], #0x10\n" + "st1 { v12.4s }, [x24], #0x10\n" + "st1 { v16.4s }, [x23], #0x10\n" + "st1 { v20.4s }, [x22], #0x10\n" + "st1 { v24.4s }, [x21], #0x10\n" + "tbz x11, #1, 160f\n" + "str d9, [x28], #0x8\n" + "str d13, [x24], #0x8\n" + "str d17, [x23], #0x8\n" + "str d21, [x22], #0x8\n" + "str d25, [x21], #0x8\n" + "tbz x11, #0, 163f\n" + "st1 { v9.s }[2], [x28]\n" + "st1 { v13.s }[2], [x24]\n" + "st1 { v17.s }[2], [x23]\n" + "st1 { v21.s }[2], [x22]\n" + "st1 { v25.s }[2], [x21]\n" + "b 163f\n" + "160:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x11, #0, 163f\n" + "str s9, [x28, #0x0]\n" + "str s13, [x24, #0x0]\n" + "str s17, [x23, #0x0]\n" + "str s21, [x22, #0x0]\n" + "str s25, [x21, #0x0]\n" + "b 163f\n" + "161:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x11, #1, 162f\n" + "str d8, [x28], #0x8\n" + "str d12, [x24], #0x8\n" + "str d16, [x23], #0x8\n" + "str d20, [x22], #0x8\n" + "str d24, [x21], #0x8\n" + "tbz x11, #0, 163f\n" + "st1 { v8.s }[2], [x28]\n" + "st1 { v12.s }[2], [x24]\n" + "st1 { v16.s }[2], [x23]\n" + "st1 { v20.s }[2], [x22]\n" + "st1 { v24.s }[2], [x21]\n" + "b 163f\n" + "162:" // Height 5: Partial direct writeback: partial_1_0 + "str s8, [x28, #0x0]\n" + "str s12, [x24, #0x0]\n" + "str s16, [x23, #0x0]\n" + "str s20, [x22, #0x0]\n" + "str s24, [x21, #0x0]\n" + "163:" // Height 5: Partial direct writeback: Done + "b 165f\n" + "164:" // Height 5: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q12, [x24, #0x0]\n" + "str q13, [x24, #0x10]\n" + "str q14, [x24, #0x20]\n" + "str q15, [x24, #0x30]\n" + "str q16, [x23, #0x0]\n" + "str q17, [x23, #0x10]\n" + "str q18, [x23, #0x20]\n" + "str q19, [x23, #0x30]\n" + "str q20, [x22, #0x0]\n" + "str q21, [x22, #0x10]\n" + "str q22, [x22, #0x20]\n" + "str q23, [x22, #0x30]\n" + "str q24, [x21, #0x0]\n" + "str q25, [x21, #0x10]\n" + "str q26, [x21, #0x20]\n" + "str q27, 
[x21, #0x30]\n" + "165:" // Height 5: Writeback done + "subs x11, x11, #0x10\n" + "bgt 134b\n" + "b 200f\n" + "166:" // Height 6 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x19, #0x18\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "167:" // Height 6: Column loop + "cbz x9, 168f\n" + "ldr q8, [x9, #0x0]\n" "mov v12.16b, v8.16b\n" - "ldr q9, [x14, #0x10]\n" + "ldr q9, [x9, #0x10]\n" "mov v16.16b, v8.16b\n" - "ldr q10, [x14, #0x20]\n" + "ldr q10, [x9, #0x20]\n" "mov v20.16b, v8.16b\n" - "ldr q11, [x14, #0x30]\n" + "ldr q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" "mov v24.16b, v8.16b\n" - "add x14, x14, #0x40\n" "mov v28.16b, v8.16b\n" "mov v13.16b, v9.16b\n" - "mov v17.16b, v9.16b\n" "mov v14.16b, v10.16b\n" "mov v15.16b, v11.16b\n" + "mov v17.16b, v9.16b\n" "mov v18.16b, v10.16b\n" "mov v19.16b, v11.16b\n" "mov v21.16b, v9.16b\n" @@ -2641,178 +2569,179 @@ void a64_hybrid_fp32_mla_6x16 ( "mov v29.16b, v9.16b\n" "mov v30.16b, v10.16b\n" "mov v31.16b, v11.16b\n" - "b 185f\n" - "174:" // Height 6: no bias - "tbz %x[flags], #0, 184f\n" - "cmp x16, #0x10\n" - "bge 183f\n" - "tbz x16, #3, 178f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "ld1 { v20.4s }, [x25], #0x10\n" - "ld1 { v24.4s }, [x23], #0x10\n" - "ld1 { v28.4s }, [x21], #0x10\n" - "ld1 { v9.4s }, [x13], #0x10\n" - "ld1 { v13.4s }, [x9], #0x10\n" - "ld1 { v17.4s }, [x27], #0x10\n" - "ld1 { v21.4s }, [x25], #0x10\n" - "ld1 { v25.4s }, [x23], #0x10\n" - "ld1 { v29.4s }, [x21], #0x10\n" - "tbz x16, #2, 176f\n" - "ld1 { v10.4s }, [x13], #0x10\n" - "ld1 { v14.4s }, [x9], #0x10\n" - "ld1 { v18.4s }, [x27], #0x10\n" - "ld1 { v22.4s }, [x25], #0x10\n" - "ld1 { v26.4s }, [x23], #0x10\n" - "ld1 { v30.4s }, [x21], #0x10\n" - "tbz x16, #1, 175f\n" + "b 179f\n" + "168:" // Height 6: no bias + 
"tbz %x[flags], #0, 178f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "bge 177f\n" + "tbz x11, #3, 172f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v16.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "ld1 { v24.4s }, [x21], #0x10\n" + "ld1 { v28.4s }, [x20], #0x10\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x24], #0x10\n" + "ld1 { v17.4s }, [x23], #0x10\n" + "ld1 { v21.4s }, [x22], #0x10\n" + "ld1 { v25.4s }, [x21], #0x10\n" + "ld1 { v29.4s }, [x20], #0x10\n" + "tbz x11, #2, 170f\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x24], #0x10\n" + "ld1 { v18.4s }, [x23], #0x10\n" + "ld1 { v22.4s }, [x22], #0x10\n" + "ld1 { v26.4s }, [x21], #0x10\n" + "ld1 { v30.4s }, [x20], #0x10\n" + "tbz x11, #1, 169f\n" "mov x19, #0x38\n" - "ldr d11, [x13], #0x8\n" - "ldr d15, [x9], #0x8\n" - "ldr d19, [x27], #0x8\n" - "ldr d23, [x25], #0x8\n" - "ldr d27, [x23], #0x8\n" - "ldr d31, [x21], #0x8\n" - "tbz x16, #0, 182f\n" - "ld1 { v11.s }[2], [x13]\n" - "ld1 { v15.s }[2], [x9]\n" - "ld1 { v19.s }[2], [x27]\n" - "ld1 { v23.s }[2], [x25]\n" - "ld1 { v27.s }[2], [x23]\n" - "ld1 { v31.s }[2], [x21]\n" - "b 182f\n" - "175:" // Height 6: Partial accumulate: partial_1_12 + "ldr d11, [x28], #0x8\n" + "ldr d15, [x24], #0x8\n" + "ldr d19, [x23], #0x8\n" + "ldr d23, [x22], #0x8\n" + "ldr d27, [x21], #0x8\n" + "ldr d31, [x20], #0x8\n" + "tbz x11, #0, 176f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x24]\n" + "ld1 { v19.s }[2], [x23]\n" + "ld1 { v23.s }[2], [x22]\n" + "ld1 { v27.s }[2], [x21]\n" + "ld1 { v31.s }[2], [x20]\n" + "b 176f\n" + "169:" // Height 6: Partial accumulate: partial_1_12 "mov x19, #0x30\n" - "tbz x16, #0, 182f\n" - "ldr s11, [x13, #0x0]\n" - "ldr s15, [x9, #0x0]\n" - "ldr s19, [x27, #0x0]\n" - "ldr 
s23, [x25, #0x0]\n" - "ldr s27, [x23, #0x0]\n" - "ldr s31, [x21, #0x0]\n" - "b 182f\n" - "176:" // Height 6: Partial accumulate: partial_2_8 - "tbz x16, #1, 177f\n" - "ldr d10, [x13], #0x8\n" - "ldr d14, [x9], #0x8\n" - "ldr d18, [x27], #0x8\n" - "ldr d22, [x25], #0x8\n" - "ldr d26, [x23], #0x8\n" - "ldr d30, [x21], #0x8\n" + "tbz x11, #0, 176f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s15, [x24, #0x0]\n" + "ldr s19, [x23, #0x0]\n" + "ldr s23, [x22, #0x0]\n" + "ldr s27, [x21, #0x0]\n" + "ldr s31, [x20, #0x0]\n" + "b 176f\n" + "170:" // Height 6: Partial accumulate: partial_2_8 + "tbz x11, #1, 171f\n" + "ldr d10, [x28], #0x8\n" + "ldr d14, [x24], #0x8\n" "mov x19, #0x28\n" - "tbz x16, #0, 182f\n" - "ld1 { v10.s }[2], [x13]\n" - "ld1 { v14.s }[2], [x9]\n" - "ld1 { v18.s }[2], [x27]\n" - "ld1 { v22.s }[2], [x25]\n" - "ld1 { v26.s }[2], [x23]\n" - "ld1 { v30.s }[2], [x21]\n" - "b 182f\n" - "177:" // Height 6: Partial accumulate: partial_1_8 + "ldr d18, [x23], #0x8\n" + "ldr d22, [x22], #0x8\n" + "ldr d26, [x21], #0x8\n" + "ldr d30, [x20], #0x8\n" + "tbz x11, #0, 176f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x24]\n" + "ld1 { v18.s }[2], [x23]\n" + "ld1 { v22.s }[2], [x22]\n" + "ld1 { v26.s }[2], [x21]\n" + "ld1 { v30.s }[2], [x20]\n" + "b 176f\n" + "171:" // Height 6: Partial accumulate: partial_1_8 "mov x19, #0x20\n" - "tbz x16, #0, 182f\n" - "ldr s10, [x13, #0x0]\n" - "ldr s14, [x9, #0x0]\n" - "ldr s18, [x27, #0x0]\n" - "ldr s22, [x25, #0x0]\n" - "ldr s26, [x23, #0x0]\n" - "ldr s30, [x21, #0x0]\n" - "b 182f\n" - "178:" // Height 6: Partial accumulate: partial_4_0 - "tbz x16, #2, 180f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "ld1 { v20.4s }, [x25], #0x10\n" - "ld1 { v24.4s }, [x23], #0x10\n" - "ld1 { v28.4s }, [x21], #0x10\n" - "tbz x16, #1, 179f\n" + "tbz x11, #0, 176f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s14, [x24, #0x0]\n" + "ldr s18, [x23, #0x0]\n" + "ldr s22, [x22, #0x0]\n" + "ldr s26, 
[x21, #0x0]\n" + "ldr s30, [x20, #0x0]\n" + "b 176f\n" + "172:" // Height 6: Partial accumulate: partial_4_0 + "tbz x11, #2, 174f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x24], #0x10\n" + "ld1 { v16.4s }, [x23], #0x10\n" + "ld1 { v20.4s }, [x22], #0x10\n" + "ld1 { v24.4s }, [x21], #0x10\n" + "ld1 { v28.4s }, [x20], #0x10\n" + "tbz x11, #1, 173f\n" "mov x19, #0x18\n" - "ldr d9, [x13], #0x8\n" - "ldr d13, [x9], #0x8\n" - "ldr d17, [x27], #0x8\n" - "ldr d21, [x25], #0x8\n" - "ldr d25, [x23], #0x8\n" - "ldr d29, [x21], #0x8\n" - "tbz x16, #0, 182f\n" - "ld1 { v9.s }[2], [x13]\n" - "ld1 { v13.s }[2], [x9]\n" - "ld1 { v17.s }[2], [x27]\n" - "ld1 { v21.s }[2], [x25]\n" - "ld1 { v25.s }[2], [x23]\n" - "ld1 { v29.s }[2], [x21]\n" - "b 182f\n" - "179:" // Height 6: Partial accumulate: partial_1_4 + "ldr d9, [x28], #0x8\n" + "ldr d13, [x24], #0x8\n" + "ldr d17, [x23], #0x8\n" + "ldr d21, [x22], #0x8\n" + "ldr d25, [x21], #0x8\n" + "ldr d29, [x20], #0x8\n" + "tbz x11, #0, 176f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x24]\n" + "ld1 { v17.s }[2], [x23]\n" + "ld1 { v21.s }[2], [x22]\n" + "ld1 { v25.s }[2], [x21]\n" + "ld1 { v29.s }[2], [x20]\n" + "b 176f\n" + "173:" // Height 6: Partial accumulate: partial_1_4 "mov x19, #0x10\n" - "tbz x16, #0, 182f\n" - "ldr s9, [x13, #0x0]\n" - "ldr s13, [x9, #0x0]\n" - "ldr s17, [x27, #0x0]\n" - "ldr s21, [x25, #0x0]\n" - "ldr s25, [x23, #0x0]\n" - "ldr s29, [x21, #0x0]\n" - "b 182f\n" - "180:" // Height 6: Partial accumulate: partial_2_0 - "tbz x16, #1, 181f\n" - "ldr d8, [x13], #0x8\n" - "ldr d12, [x9], #0x8\n" - "ldr d16, [x27], #0x8\n" - "ldr d20, [x25], #0x8\n" - "ldr d24, [x23], #0x8\n" - "ldr d28, [x21], #0x8\n" + "tbz x11, #0, 176f\n" + "ldr s9, [x28, #0x0]\n" + "ldr s13, [x24, #0x0]\n" + "ldr s17, [x23, #0x0]\n" + "ldr s21, [x22, #0x0]\n" + "ldr s25, [x21, #0x0]\n" + "ldr s29, [x20, #0x0]\n" + "b 176f\n" + "174:" // Height 6: Partial accumulate: partial_2_0 + "tbz x11, #1, 175f\n" + "ldr d8, [x28], 
#0x8\n" + "ldr d12, [x24], #0x8\n" "mov x19, #0x8\n" - "tbz x16, #0, 182f\n" - "ld1 { v8.s }[2], [x13]\n" - "ld1 { v12.s }[2], [x9]\n" - "ld1 { v16.s }[2], [x27]\n" - "ld1 { v20.s }[2], [x25]\n" - "ld1 { v24.s }[2], [x23]\n" - "ld1 { v28.s }[2], [x21]\n" - "b 182f\n" - "181:" // Height 6: Partial accumulate: partial_1_0 + "ldr d16, [x23], #0x8\n" + "ldr d20, [x22], #0x8\n" + "ldr d24, [x21], #0x8\n" + "ldr d28, [x20], #0x8\n" + "tbz x11, #0, 176f\n" + "ld1 { v8.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x24]\n" + "ld1 { v16.s }[2], [x23]\n" + "ld1 { v20.s }[2], [x22]\n" + "ld1 { v24.s }[2], [x21]\n" + "ld1 { v28.s }[2], [x20]\n" + "b 176f\n" + "175:" // Height 6: Partial accumulate: partial_1_0 + "ldr s8, [x28, #0x0]\n" "mov x19, #0x0\n" - "ldr s8, [x13, #0x0]\n" - "ldr s12, [x9, #0x0]\n" - "ldr s16, [x27, #0x0]\n" - "ldr s20, [x25, #0x0]\n" - "ldr s24, [x23, #0x0]\n" - "ldr s28, [x21, #0x0]\n" - "182:" // Height 6: Partial accumulate: Done - "sub x13, x13, x19\n" - "sub x9, x9, x19\n" - "sub x27, x27, x19\n" - "sub x25, x25, x19\n" - "sub x23, x23, x19\n" - "sub x21, x21, x19\n" - "b 185f\n" - "183:" // Height 6: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "ldr q12, [x9, #0x0]\n" - "ldr q13, [x9, #0x10]\n" - "ldr q14, [x9, #0x20]\n" - "ldr q15, [x9, #0x30]\n" - "ldr q16, [x27, #0x0]\n" - "ldr q17, [x27, #0x10]\n" - "ldr q18, [x27, #0x20]\n" - "ldr q19, [x27, #0x30]\n" - "ldr q20, [x25, #0x0]\n" - "ldr q21, [x25, #0x10]\n" - "ldr q22, [x25, #0x20]\n" - "ldr q23, [x25, #0x30]\n" - "ldr q24, [x23, #0x0]\n" - "ldr q25, [x23, #0x10]\n" - "ldr q26, [x23, #0x20]\n" - "ldr q27, [x23, #0x30]\n" - "ldr q28, [x21, #0x0]\n" - "ldr q29, [x21, #0x10]\n" - "ldr q30, [x21, #0x20]\n" - "ldr q31, [x21, #0x30]\n" - "b 185f\n" - "184:" // Height 6: no accumulate + "ldr s12, [x24, #0x0]\n" + "ldr s16, [x23, #0x0]\n" + "ldr s20, [x22, #0x0]\n" + "ldr s24, [x21, #0x0]\n" + "ldr s28, [x20, #0x0]\n" + "176:" // 
Height 6: Partial accumulate: Done + "sub x28, x28, x19\n" + "b 179f\n" + "177:" // Height 6: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "ldr q12, [x24, #0x0]\n" + "ldr q13, [x24, #0x10]\n" + "ldr q14, [x24, #0x20]\n" + "ldr q15, [x24, #0x30]\n" + "ldr q16, [x23, #0x0]\n" + "ldr q17, [x23, #0x10]\n" + "ldr q18, [x23, #0x20]\n" + "ldr q19, [x23, #0x30]\n" + "ldr q20, [x22, #0x0]\n" + "ldr q21, [x22, #0x10]\n" + "ldr q22, [x22, #0x20]\n" + "ldr q23, [x22, #0x30]\n" + "ldr q24, [x21, #0x0]\n" + "ldr q25, [x21, #0x10]\n" + "ldr q26, [x21, #0x20]\n" + "ldr q27, [x21, #0x30]\n" + "ldr q28, [x20, #0x0]\n" + "ldr q29, [x20, #0x10]\n" + "ldr q30, [x20, #0x20]\n" + "ldr q31, [x20, #0x30]\n" + "b 179f\n" + "178:" // Height 6: no accumulate "movi v8.16b, #0x0\n" "movi v9.16b, #0x0\n" "movi v10.16b, #0x0\n" @@ -2837,299 +2766,299 @@ void a64_hybrid_fp32_mla_6x16 ( "movi v29.16b, #0x0\n" "movi v30.16b, #0x0\n" "movi v31.16b, #0x0\n" - "185:" // Height 6: setup done - "mov x12, #0x0\n" - "186:" // Height 6: String loop + "179:" // Height 6: setup done + "mov x27, #0x0\n" + "180:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 187f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 181f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "ldr x22, [x20, #0x20]\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" "ldr x20, [x20, #0x28]\n" - "cbnz x12, 188f\n" + "cbnz x27, 182f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #2\n" - "add 
x28, x28, x19, LSL #2\n" - "add x26, x26, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" "add x22, x22, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" "add x20, x20, x19, LSL #2\n" - "b 188f\n" - "187:" // Height 6: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #2\n" - "add x26, x28, x19, LSL #2\n" - "add x24, x26, x19, LSL #2\n" - "add x22, x24, x19, LSL #2\n" - "add x20, x22, x19, LSL #2\n" - "188:" // Height 6: input setup done - "cmp x11, #0x4\n" - "blt 191f\n" - "cmp x11, #0x8\n" - "blt 190f\n" - "189:" // Height 6: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q4, [x22, #0x0]\n" + "b 182f\n" + "181:" // Height 6: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "182:" // Height 6: input setup done + "cmp x26, #0x4\n" + "blt 185f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x8\n" + "ldr q2, [x23, #0x0]\n" + "ldr q3, [x22, #0x0]\n" + "ldr q4, [x21, #0x0]\n" "ldr q5, [x20, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "ldr q6, [x10, #0x0]\n" + "blt 184f\n" + "183:" // Height 6: Multiply loop: Main loop head "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" + "add x25, x25, #0x10\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" "fmla v16.4s, v6.4s, v2.s[0]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x23, x23, #0x10\n" "fmla v20.4s, v6.4s, v3.s[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x23, #0x80]\n" + "add x22, x22, #0x10\n" "fmla v24.4s, v6.4s, v4.s[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "add 
x24, x24, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "add x21, x21, #0x10\n" "fmla v28.4s, v6.4s, v5.s[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q6, [x10, #0x20]\n" + "add x20, x20, #0x10\n" "fmla v9.4s, v7.4s, v0.s[0]\n" - "add x22, x22, #0x10\n" - "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "sub x26, x26, #0x4\n" "fmla v13.4s, v7.4s, v1.s[0]\n" - "add x20, x20, #0x10\n" - "fmla v17.4s, v7.4s, v2.s[0]\n" "prfm pldl1keep, [x20, #0x80]\n" - "sub x11, x11, #0x4\n" + "cmp x26, #0x8\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" "fmla v21.4s, v7.4s, v3.s[0]\n" - "cmp x11, #0x8\n" "fmla v25.4s, v7.4s, v4.s[0]\n" "fmla v29.4s, v7.4s, v5.s[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" "fmla v10.4s, v6.4s, v0.s[0]\n" "fmla v14.4s, v6.4s, v1.s[0]\n" "fmla v18.4s, v6.4s, v2.s[0]\n" "fmla v22.4s, v6.4s, v3.s[0]\n" "fmla v26.4s, v6.4s, v4.s[0]\n" "fmla v30.4s, v6.4s, v5.s[0]\n" - "ldr q6, [x15, #0x40]\n" + "ldr q6, [x10, #0x40]\n" "fmla v11.4s, v7.4s, v0.s[0]\n" "fmla v15.4s, v7.4s, v1.s[0]\n" "fmla v19.4s, v7.4s, v2.s[0]\n" "fmla v23.4s, v7.4s, v3.s[0]\n" "fmla v27.4s, v7.4s, v4.s[0]\n" "fmla v31.4s, v7.4s, v5.s[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" "fmla v8.4s, v6.4s, v0.s[1]\n" "fmla v12.4s, v6.4s, v1.s[1]\n" "fmla v16.4s, v6.4s, v2.s[1]\n" "fmla v20.4s, v6.4s, v3.s[1]\n" "fmla v24.4s, v6.4s, v4.s[1]\n" "fmla v28.4s, v6.4s, v5.s[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" "fmla v9.4s, v7.4s, v0.s[1]\n" "fmla v13.4s, v7.4s, v1.s[1]\n" "fmla v17.4s, v7.4s, v2.s[1]\n" "fmla v21.4s, v7.4s, v3.s[1]\n" "fmla v25.4s, v7.4s, v4.s[1]\n" "fmla v29.4s, v7.4s, v5.s[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" "fmla v10.4s, v6.4s, v0.s[1]\n" "fmla v14.4s, v6.4s, v1.s[1]\n" "fmla v18.4s, v6.4s, v2.s[1]\n" "fmla v22.4s, v6.4s, v3.s[1]\n" "fmla v26.4s, v6.4s, v4.s[1]\n" "fmla v30.4s, v6.4s, v5.s[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" "fmla v11.4s, 
v7.4s, v0.s[1]\n" "fmla v15.4s, v7.4s, v1.s[1]\n" "fmla v19.4s, v7.4s, v2.s[1]\n" "fmla v23.4s, v7.4s, v3.s[1]\n" "fmla v27.4s, v7.4s, v4.s[1]\n" "fmla v31.4s, v7.4s, v5.s[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" "fmla v8.4s, v6.4s, v0.s[2]\n" "fmla v12.4s, v6.4s, v1.s[2]\n" "fmla v16.4s, v6.4s, v2.s[2]\n" "fmla v20.4s, v6.4s, v3.s[2]\n" "fmla v24.4s, v6.4s, v4.s[2]\n" "fmla v28.4s, v6.4s, v5.s[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" "fmla v9.4s, v7.4s, v0.s[2]\n" "fmla v13.4s, v7.4s, v1.s[2]\n" "fmla v17.4s, v7.4s, v2.s[2]\n" "fmla v21.4s, v7.4s, v3.s[2]\n" "fmla v25.4s, v7.4s, v4.s[2]\n" "fmla v29.4s, v7.4s, v5.s[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" "fmla v10.4s, v6.4s, v0.s[2]\n" "fmla v14.4s, v6.4s, v1.s[2]\n" "fmla v18.4s, v6.4s, v2.s[2]\n" "fmla v22.4s, v6.4s, v3.s[2]\n" "fmla v26.4s, v6.4s, v4.s[2]\n" "fmla v30.4s, v6.4s, v5.s[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" "fmla v11.4s, v7.4s, v0.s[2]\n" "fmla v15.4s, v7.4s, v1.s[2]\n" "fmla v19.4s, v7.4s, v2.s[2]\n" "fmla v23.4s, v7.4s, v3.s[2]\n" "fmla v27.4s, v7.4s, v4.s[2]\n" "fmla v31.4s, v7.4s, v5.s[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" "fmla v8.4s, v6.4s, v0.s[3]\n" "fmla v12.4s, v6.4s, v1.s[3]\n" "fmla v16.4s, v6.4s, v2.s[3]\n" "fmla v20.4s, v6.4s, v3.s[3]\n" "fmla v24.4s, v6.4s, v4.s[3]\n" "fmla v28.4s, v6.4s, v5.s[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" "fmla v9.4s, v7.4s, v0.s[3]\n" "fmla v13.4s, v7.4s, v1.s[3]\n" "fmla v17.4s, v7.4s, v2.s[3]\n" "fmla v21.4s, v7.4s, v3.s[3]\n" "fmla v25.4s, v7.4s, v4.s[3]\n" "fmla v29.4s, v7.4s, v5.s[3]\n" - "ldr q7, [x15, #0xf0]\n" + "ldr q7, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" "fmla v10.4s, v6.4s, v0.s[3]\n" - "add x15, x15, #0x100\n" "fmla v14.4s, v6.4s, v1.s[3]\n" "fmla v18.4s, v6.4s, v2.s[3]\n" "fmla v22.4s, v6.4s, v3.s[3]\n" "fmla v26.4s, v6.4s, v4.s[3]\n" "fmla v30.4s, v6.4s, v5.s[3]\n" + "ldr q6, [x10, #0x0]\n" "fmla v11.4s, v7.4s, 
v0.s[3]\n" + "ldr q0, [x25, #0x0]\n" "fmla v15.4s, v7.4s, v1.s[3]\n" + "ldr q1, [x24, #0x0]\n" "fmla v19.4s, v7.4s, v2.s[3]\n" + "ldr q2, [x23, #0x0]\n" "fmla v23.4s, v7.4s, v3.s[3]\n" + "ldr q3, [x22, #0x0]\n" "fmla v27.4s, v7.4s, v4.s[3]\n" + "ldr q4, [x21, #0x0]\n" "fmla v31.4s, v7.4s, v5.s[3]\n" - "bge 189b\n" - "190:" // Height 6: Multiply loop: Single iteration only - "sub x11, x11, #0x4\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q4, [x22, #0x0]\n" "ldr q5, [x20, #0x0]\n" - "ldr q6, [x15, #0x0]\n" + "bge 183b\n" + "184:" // Height 6: Multiply loop: Single iteration only "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" + "sub x26, x26, #0x4\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "add x25, x25, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" "fmla v16.4s, v6.4s, v2.s[0]\n" - "add x28, x28, #0x10\n" - "fmla v20.4s, v6.4s, v3.s[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" - "fmla v24.4s, v6.4s, v4.s[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" "add x24, x24, #0x10\n" - "fmla v28.4s, v6.4s, v5.s[0]\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" "prfm pldl1keep, [x24, #0x80]\n" - "ldr q6, [x15, #0x20]\n" - "fmla v9.4s, v7.4s, v0.s[0]\n" + "add x23, x23, #0x10\n" + "fmla v24.4s, v6.4s, v4.s[0]\n" + "prfm pldl1keep, [x23, #0x80]\n" "add x22, x22, #0x10\n" + "fmla v28.4s, v6.4s, v5.s[0]\n" "prfm pldl1keep, [x22, #0x80]\n" - "fmla v13.4s, v7.4s, v1.s[0]\n" + "add x21, x21, #0x10\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "ldr q6, [x10, #0x20]\n" "add x20, x20, #0x10\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "prfm pldl1keep, [x21, #0x80]\n" "fmla v17.4s, v7.4s, v2.s[0]\n" "prfm pldl1keep, [x20, #0x80]\n" "fmla v21.4s, v7.4s, v3.s[0]\n" "fmla v25.4s, v7.4s, v4.s[0]\n" "fmla v29.4s, v7.4s, v5.s[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" "fmla v10.4s, v6.4s, v0.s[0]\n" "fmla v14.4s, v6.4s, 
v1.s[0]\n" "fmla v18.4s, v6.4s, v2.s[0]\n" "fmla v22.4s, v6.4s, v3.s[0]\n" "fmla v26.4s, v6.4s, v4.s[0]\n" "fmla v30.4s, v6.4s, v5.s[0]\n" - "ldr q6, [x15, #0x40]\n" + "ldr q6, [x10, #0x40]\n" "fmla v11.4s, v7.4s, v0.s[0]\n" "fmla v15.4s, v7.4s, v1.s[0]\n" "fmla v19.4s, v7.4s, v2.s[0]\n" "fmla v23.4s, v7.4s, v3.s[0]\n" "fmla v27.4s, v7.4s, v4.s[0]\n" "fmla v31.4s, v7.4s, v5.s[0]\n" - "ldr q7, [x15, #0x50]\n" + "ldr q7, [x10, #0x50]\n" "fmla v8.4s, v6.4s, v0.s[1]\n" "fmla v12.4s, v6.4s, v1.s[1]\n" "fmla v16.4s, v6.4s, v2.s[1]\n" "fmla v20.4s, v6.4s, v3.s[1]\n" "fmla v24.4s, v6.4s, v4.s[1]\n" "fmla v28.4s, v6.4s, v5.s[1]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q6, [x10, #0x60]\n" "fmla v9.4s, v7.4s, v0.s[1]\n" "fmla v13.4s, v7.4s, v1.s[1]\n" "fmla v17.4s, v7.4s, v2.s[1]\n" "fmla v21.4s, v7.4s, v3.s[1]\n" "fmla v25.4s, v7.4s, v4.s[1]\n" "fmla v29.4s, v7.4s, v5.s[1]\n" - "ldr q7, [x15, #0x70]\n" + "ldr q7, [x10, #0x70]\n" "fmla v10.4s, v6.4s, v0.s[1]\n" "fmla v14.4s, v6.4s, v1.s[1]\n" "fmla v18.4s, v6.4s, v2.s[1]\n" "fmla v22.4s, v6.4s, v3.s[1]\n" "fmla v26.4s, v6.4s, v4.s[1]\n" "fmla v30.4s, v6.4s, v5.s[1]\n" - "ldr q6, [x15, #0x80]\n" + "ldr q6, [x10, #0x80]\n" "fmla v11.4s, v7.4s, v0.s[1]\n" "fmla v15.4s, v7.4s, v1.s[1]\n" "fmla v19.4s, v7.4s, v2.s[1]\n" "fmla v23.4s, v7.4s, v3.s[1]\n" "fmla v27.4s, v7.4s, v4.s[1]\n" "fmla v31.4s, v7.4s, v5.s[1]\n" - "ldr q7, [x15, #0x90]\n" + "ldr q7, [x10, #0x90]\n" "fmla v8.4s, v6.4s, v0.s[2]\n" "fmla v12.4s, v6.4s, v1.s[2]\n" "fmla v16.4s, v6.4s, v2.s[2]\n" "fmla v20.4s, v6.4s, v3.s[2]\n" "fmla v24.4s, v6.4s, v4.s[2]\n" "fmla v28.4s, v6.4s, v5.s[2]\n" - "ldr q6, [x15, #0xa0]\n" + "ldr q6, [x10, #0xa0]\n" "fmla v9.4s, v7.4s, v0.s[2]\n" "fmla v13.4s, v7.4s, v1.s[2]\n" "fmla v17.4s, v7.4s, v2.s[2]\n" "fmla v21.4s, v7.4s, v3.s[2]\n" "fmla v25.4s, v7.4s, v4.s[2]\n" "fmla v29.4s, v7.4s, v5.s[2]\n" - "ldr q7, [x15, #0xb0]\n" + "ldr q7, [x10, #0xb0]\n" "fmla v10.4s, v6.4s, v0.s[2]\n" "fmla v14.4s, v6.4s, v1.s[2]\n" "fmla v18.4s, v6.4s, 
v2.s[2]\n" "fmla v22.4s, v6.4s, v3.s[2]\n" "fmla v26.4s, v6.4s, v4.s[2]\n" "fmla v30.4s, v6.4s, v5.s[2]\n" - "ldr q6, [x15, #0xc0]\n" + "ldr q6, [x10, #0xc0]\n" "fmla v11.4s, v7.4s, v0.s[2]\n" "fmla v15.4s, v7.4s, v1.s[2]\n" "fmla v19.4s, v7.4s, v2.s[2]\n" "fmla v23.4s, v7.4s, v3.s[2]\n" "fmla v27.4s, v7.4s, v4.s[2]\n" "fmla v31.4s, v7.4s, v5.s[2]\n" - "ldr q7, [x15, #0xd0]\n" + "ldr q7, [x10, #0xd0]\n" "fmla v8.4s, v6.4s, v0.s[3]\n" "fmla v12.4s, v6.4s, v1.s[3]\n" "fmla v16.4s, v6.4s, v2.s[3]\n" "fmla v20.4s, v6.4s, v3.s[3]\n" "fmla v24.4s, v6.4s, v4.s[3]\n" "fmla v28.4s, v6.4s, v5.s[3]\n" - "ldr q6, [x15, #0xe0]\n" + "ldr q6, [x10, #0xe0]\n" "fmla v9.4s, v7.4s, v0.s[3]\n" "fmla v13.4s, v7.4s, v1.s[3]\n" "fmla v17.4s, v7.4s, v2.s[3]\n" "fmla v21.4s, v7.4s, v3.s[3]\n" "fmla v25.4s, v7.4s, v4.s[3]\n" "fmla v29.4s, v7.4s, v5.s[3]\n" - "ldr q7, [x15, #0xf0]\n" + "ldr q7, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" "fmla v10.4s, v6.4s, v0.s[3]\n" - "add x15, x15, #0x100\n" "fmla v14.4s, v6.4s, v1.s[3]\n" "fmla v18.4s, v6.4s, v2.s[3]\n" "fmla v22.4s, v6.4s, v3.s[3]\n" @@ -3141,34 +3070,34 @@ void a64_hybrid_fp32_mla_6x16 ( "fmla v23.4s, v7.4s, v3.s[3]\n" "fmla v27.4s, v7.4s, v4.s[3]\n" "fmla v31.4s, v7.4s, v5.s[3]\n" - "191:" // Height 6: Multiply loop: Main loop skip - "cbz x11, 193f\n" - "192:" // Height 6: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x28], #0x4\n" - "ldr s2, [x26], #0x4\n" - "ldr s3, [x24], #0x4\n" - "ldr s4, [x22], #0x4\n" + "185:" // Height 6: Multiply loop: Main loop skip + "cbz x26, 187f\n" + "186:" // Height 6: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x1\n" + "ldr s1, [x24], #0x4\n" + "ldr s2, [x23], #0x4\n" + "ldr s3, [x22], #0x4\n" + "ldr s4, [x21], #0x4\n" "ldr s5, [x20], #0x4\n" - "ldr q6, [x15, #0x0]\n" + "ldr q6, [x10, #0x0]\n" "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x15, #0x10]\n" + "ldr q7, [x10, #0x10]\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "sub x11, x11, #0x1\n" "fmla v16.4s, 
v6.4s, v2.s[0]\n" "fmla v20.4s, v6.4s, v3.s[0]\n" "fmla v24.4s, v6.4s, v4.s[0]\n" "fmla v28.4s, v6.4s, v5.s[0]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q6, [x10, #0x20]\n" "fmla v9.4s, v7.4s, v0.s[0]\n" "fmla v13.4s, v7.4s, v1.s[0]\n" "fmla v17.4s, v7.4s, v2.s[0]\n" "fmla v21.4s, v7.4s, v3.s[0]\n" "fmla v25.4s, v7.4s, v4.s[0]\n" "fmla v29.4s, v7.4s, v5.s[0]\n" - "ldr q7, [x15, #0x30]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" "fmla v10.4s, v6.4s, v0.s[0]\n" - "add x15, x15, #0x40\n" "fmla v14.4s, v6.4s, v1.s[0]\n" "fmla v18.4s, v6.4s, v2.s[0]\n" "fmla v22.4s, v6.4s, v3.s[0]\n" @@ -3180,19 +3109,25 @@ void a64_hybrid_fp32_mla_6x16 ( "fmla v23.4s, v7.4s, v3.s[0]\n" "fmla v27.4s, v7.4s, v4.s[0]\n" "fmla v31.4s, v7.4s, v5.s[0]\n" - "cbnz x11, 192b\n" - "193:" // Height 6: Multiply loop: No odd multiplies + "cbnz x26, 186b\n" + "187:" // Height 6: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 186b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "prfm pstl1keep, [x27, #0x0]\n" - "prfm pstl1keep, [x25, #0x0]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 180b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x24, x28, x19, LSL #2\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #2\n" "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" "prfm pstl1keep, [x21, #0x0]\n" - "tbz %x[flags], #1, 194f\n" + "add x20, x21, x19, LSL #2\n" + "prfm pstl1keep, [x20, #0x0]\n" + "tbz %x[flags], #1, 188f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1r { v1.4s }, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -3245,185 +3180,180 @@ void a64_hybrid_fp32_mla_6x16 ( "fmin v31.4s, v31.4s, v0.4s\n" "fmax v30.4s, v30.4s, v1.4s\n" "fmax v31.4s, v31.4s, v1.4s\n" - "194:" // Height 6: No activation - "cmp x16, #0x10\n" - 
"bge 203f\n" - "tbz x16, #3, 198f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v9.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v13.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "st1 { v17.4s }, [x27], #0x10\n" - "st1 { v20.4s }, [x25], #0x10\n" - "st1 { v21.4s }, [x25], #0x10\n" - "st1 { v24.4s }, [x23], #0x10\n" - "st1 { v25.4s }, [x23], #0x10\n" - "st1 { v28.4s }, [x21], #0x10\n" - "st1 { v29.4s }, [x21], #0x10\n" - "tbz x16, #2, 196f\n" - "st1 { v10.4s }, [x13], #0x10\n" - "st1 { v14.4s }, [x9], #0x10\n" - "st1 { v18.4s }, [x27], #0x10\n" - "st1 { v22.4s }, [x25], #0x10\n" - "st1 { v26.4s }, [x23], #0x10\n" - "st1 { v30.4s }, [x21], #0x10\n" - "tbz x16, #1, 195f\n" - "str d11, [x13], #0x8\n" - "str d15, [x9], #0x8\n" - "str d19, [x27], #0x8\n" - "str d23, [x25], #0x8\n" - "str d27, [x23], #0x8\n" - "str d31, [x21], #0x8\n" - "tbz x16, #0, 202f\n" - "st1 { v11.s }[2], [x13]\n" - "st1 { v15.s }[2], [x9]\n" - "st1 { v19.s }[2], [x27]\n" - "st1 { v23.s }[2], [x25]\n" - "st1 { v27.s }[2], [x23]\n" - "st1 { v31.s }[2], [x21]\n" - "b 202f\n" - "195:" // Height 6: Partial direct writeback: partial_1_12 - "tbz x16, #0, 202f\n" - "str s11, [x13, #0x0]\n" - "str s15, [x9, #0x0]\n" - "str s19, [x27, #0x0]\n" - "str s23, [x25, #0x0]\n" - "str s27, [x23, #0x0]\n" - "str s31, [x21, #0x0]\n" - "b 202f\n" - "196:" // Height 6: Partial direct writeback: partial_2_8 - "tbz x16, #1, 197f\n" - "str d10, [x13], #0x8\n" - "str d14, [x9], #0x8\n" - "str d18, [x27], #0x8\n" - "str d22, [x25], #0x8\n" - "str d26, [x23], #0x8\n" - "str d30, [x21], #0x8\n" - "tbz x16, #0, 202f\n" - "st1 { v10.s }[2], [x13]\n" - "st1 { v14.s }[2], [x9]\n" - "st1 { v18.s }[2], [x27]\n" - "st1 { v22.s }[2], [x25]\n" - "st1 { v26.s }[2], [x23]\n" - "st1 { v30.s }[2], [x21]\n" - "b 202f\n" - "197:" // Height 6: Partial direct writeback: partial_1_8 - "tbz x16, #0, 202f\n" - "str s10, [x13, #0x0]\n" - "str s14, [x9, #0x0]\n" - "str s18, [x27, #0x0]\n" - "str s22, [x25, #0x0]\n" - 
"str s26, [x23, #0x0]\n" - "str s30, [x21, #0x0]\n" - "b 202f\n" - "198:" // Height 6: Partial direct writeback: partial_4_0 - "tbz x16, #2, 200f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "st1 { v20.4s }, [x25], #0x10\n" - "st1 { v24.4s }, [x23], #0x10\n" - "st1 { v28.4s }, [x21], #0x10\n" - "tbz x16, #1, 199f\n" - "str d9, [x13], #0x8\n" - "str d13, [x9], #0x8\n" - "str d17, [x27], #0x8\n" - "str d21, [x25], #0x8\n" - "str d25, [x23], #0x8\n" - "str d29, [x21], #0x8\n" - "tbz x16, #0, 202f\n" - "st1 { v9.s }[2], [x13]\n" - "st1 { v13.s }[2], [x9]\n" - "st1 { v17.s }[2], [x27]\n" - "st1 { v21.s }[2], [x25]\n" - "st1 { v25.s }[2], [x23]\n" - "st1 { v29.s }[2], [x21]\n" - "b 202f\n" - "199:" // Height 6: Partial direct writeback: partial_1_4 - "tbz x16, #0, 202f\n" - "str s9, [x13, #0x0]\n" - "str s13, [x9, #0x0]\n" - "str s17, [x27, #0x0]\n" - "str s21, [x25, #0x0]\n" - "str s25, [x23, #0x0]\n" - "str s29, [x21, #0x0]\n" - "b 202f\n" - "200:" // Height 6: Partial direct writeback: partial_2_0 - "tbz x16, #1, 201f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "str d16, [x27], #0x8\n" - "str d20, [x25], #0x8\n" - "str d24, [x23], #0x8\n" - "str d28, [x21], #0x8\n" - "tbz x16, #0, 202f\n" - "st1 { v8.s }[2], [x13]\n" - "st1 { v12.s }[2], [x9]\n" - "st1 { v16.s }[2], [x27]\n" - "st1 { v20.s }[2], [x25]\n" - "st1 { v24.s }[2], [x23]\n" - "st1 { v28.s }[2], [x21]\n" - "b 202f\n" - "201:" // Height 6: Partial direct writeback: partial_1_0 - "str s8, [x13, #0x0]\n" - "str s12, [x9, #0x0]\n" - "str s16, [x27, #0x0]\n" - "str s20, [x25, #0x0]\n" - "str s24, [x23, #0x0]\n" - "str s28, [x21, #0x0]\n" - "202:" // Height 6: Partial direct writeback: Done - "b 204f\n" - "203:" // Height 6: Full writeback - "str q8, [x13, #0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "str q12, [x9, #0x0]\n" - "str q13, [x9, #0x10]\n" - "str q14, [x9, #0x20]\n" - "str q15, 
[x9, #0x30]\n" - "str q16, [x27, #0x0]\n" - "str q17, [x27, #0x10]\n" - "str q18, [x27, #0x20]\n" - "str q19, [x27, #0x30]\n" - "str q20, [x25, #0x0]\n" - "str q21, [x25, #0x10]\n" - "str q22, [x25, #0x20]\n" - "str q23, [x25, #0x30]\n" - "str q24, [x23, #0x0]\n" - "str q25, [x23, #0x10]\n" - "str q26, [x23, #0x20]\n" - "str q27, [x23, #0x30]\n" - "str q28, [x21, #0x0]\n" - "str q29, [x21, #0x10]\n" - "str q30, [x21, #0x20]\n" - "str q31, [x21, #0x30]\n" - "add x13, x13, #0x40\n" - "add x9, x9, #0x40\n" - "add x27, x27, #0x40\n" - "add x25, x25, #0x40\n" - "add x23, x23, #0x40\n" - "add x21, x21, #0x40\n" - "204:" // Height 6: Writeback done - "subs x16, x16, #0x10\n" - "bgt 173b\n" + "188:" // Height 6: No activation + "cmp x11, #0x10\n" + "bge 197f\n" + "tbz x11, #3, 192f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v9.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x24], #0x10\n" + "st1 { v13.4s }, [x24], #0x10\n" + "st1 { v16.4s }, [x23], #0x10\n" + "st1 { v17.4s }, [x23], #0x10\n" + "st1 { v20.4s }, [x22], #0x10\n" + "st1 { v21.4s }, [x22], #0x10\n" + "st1 { v24.4s }, [x21], #0x10\n" + "st1 { v25.4s }, [x21], #0x10\n" + "st1 { v28.4s }, [x20], #0x10\n" + "st1 { v29.4s }, [x20], #0x10\n" + "tbz x11, #2, 190f\n" + "st1 { v10.4s }, [x28], #0x10\n" + "st1 { v14.4s }, [x24], #0x10\n" + "st1 { v18.4s }, [x23], #0x10\n" + "st1 { v22.4s }, [x22], #0x10\n" + "st1 { v26.4s }, [x21], #0x10\n" + "st1 { v30.4s }, [x20], #0x10\n" + "tbz x11, #1, 189f\n" + "str d11, [x28], #0x8\n" + "str d15, [x24], #0x8\n" + "str d19, [x23], #0x8\n" + "str d23, [x22], #0x8\n" + "str d27, [x21], #0x8\n" + "str d31, [x20], #0x8\n" + "tbz x11, #0, 196f\n" + "st1 { v11.s }[2], [x28]\n" + "st1 { v15.s }[2], [x24]\n" + "st1 { v19.s }[2], [x23]\n" + "st1 { v23.s }[2], [x22]\n" + "st1 { v27.s }[2], [x21]\n" + "st1 { v31.s }[2], [x20]\n" + "b 196f\n" + "189:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x11, #0, 196f\n" + "str s11, [x28, #0x0]\n" + "str s15, [x24, #0x0]\n" + "str s19, [x23, 
#0x0]\n" + "str s23, [x22, #0x0]\n" + "str s27, [x21, #0x0]\n" + "str s31, [x20, #0x0]\n" + "b 196f\n" + "190:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x11, #1, 191f\n" + "str d10, [x28], #0x8\n" + "str d14, [x24], #0x8\n" + "str d18, [x23], #0x8\n" + "str d22, [x22], #0x8\n" + "str d26, [x21], #0x8\n" + "str d30, [x20], #0x8\n" + "tbz x11, #0, 196f\n" + "st1 { v10.s }[2], [x28]\n" + "st1 { v14.s }[2], [x24]\n" + "st1 { v18.s }[2], [x23]\n" + "st1 { v22.s }[2], [x22]\n" + "st1 { v26.s }[2], [x21]\n" + "st1 { v30.s }[2], [x20]\n" + "b 196f\n" + "191:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x11, #0, 196f\n" + "str s10, [x28, #0x0]\n" + "str s14, [x24, #0x0]\n" + "str s18, [x23, #0x0]\n" + "str s22, [x22, #0x0]\n" + "str s26, [x21, #0x0]\n" + "str s30, [x20, #0x0]\n" + "b 196f\n" + "192:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x11, #2, 194f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x24], #0x10\n" + "st1 { v16.4s }, [x23], #0x10\n" + "st1 { v20.4s }, [x22], #0x10\n" + "st1 { v24.4s }, [x21], #0x10\n" + "st1 { v28.4s }, [x20], #0x10\n" + "tbz x11, #1, 193f\n" + "str d9, [x28], #0x8\n" + "str d13, [x24], #0x8\n" + "str d17, [x23], #0x8\n" + "str d21, [x22], #0x8\n" + "str d25, [x21], #0x8\n" + "str d29, [x20], #0x8\n" + "tbz x11, #0, 196f\n" + "st1 { v9.s }[2], [x28]\n" + "st1 { v13.s }[2], [x24]\n" + "st1 { v17.s }[2], [x23]\n" + "st1 { v21.s }[2], [x22]\n" + "st1 { v25.s }[2], [x21]\n" + "st1 { v29.s }[2], [x20]\n" + "b 196f\n" + "193:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x11, #0, 196f\n" + "str s9, [x28, #0x0]\n" + "str s13, [x24, #0x0]\n" + "str s17, [x23, #0x0]\n" + "str s21, [x22, #0x0]\n" + "str s25, [x21, #0x0]\n" + "str s29, [x20, #0x0]\n" + "b 196f\n" + "194:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x11, #1, 195f\n" + "str d8, [x28], #0x8\n" + "str d12, [x24], #0x8\n" + "str d16, [x23], #0x8\n" + "str d20, [x22], #0x8\n" + "str d24, [x21], #0x8\n" + 
"str d28, [x20], #0x8\n" + "tbz x11, #0, 196f\n" + "st1 { v8.s }[2], [x28]\n" + "st1 { v12.s }[2], [x24]\n" + "st1 { v16.s }[2], [x23]\n" + "st1 { v20.s }[2], [x22]\n" + "st1 { v24.s }[2], [x21]\n" + "st1 { v28.s }[2], [x20]\n" + "b 196f\n" + "195:" // Height 6: Partial direct writeback: partial_1_0 + "str s8, [x28, #0x0]\n" + "str s12, [x24, #0x0]\n" + "str s16, [x23, #0x0]\n" + "str s20, [x22, #0x0]\n" + "str s24, [x21, #0x0]\n" + "str s28, [x20, #0x0]\n" + "196:" // Height 6: Partial direct writeback: Done + "b 198f\n" + "197:" // Height 6: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q12, [x24, #0x0]\n" + "str q13, [x24, #0x10]\n" + "str q14, [x24, #0x20]\n" + "str q15, [x24, #0x30]\n" + "str q16, [x23, #0x0]\n" + "str q17, [x23, #0x10]\n" + "str q18, [x23, #0x20]\n" + "str q19, [x23, #0x30]\n" + "str q20, [x22, #0x0]\n" + "str q21, [x22, #0x10]\n" + "str q22, [x22, #0x20]\n" + "str q23, [x22, #0x30]\n" + "str q24, [x21, #0x0]\n" + "str q25, [x21, #0x10]\n" + "str q26, [x21, #0x20]\n" + "str q27, [x21, #0x30]\n" + "str q28, [x20, #0x0]\n" + "str q29, [x20, #0x10]\n" + "str q30, [x20, #0x20]\n" + "str q31, [x20, #0x30]\n" + "198:" // Height 6: Writeback done + "subs x11, x11, #0x10\n" + "bgt 167b\n" "subs %x[M], %x[M], #0x6\n" - "beq 206f\n" + "beq 200f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 205f\n" + "tbz %x[flags], #3, 199f\n" "add x20, x20, #0x6\n" "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "205:" // Update direct input + "199:" // Update direct input "mov x19, #0x18\n" "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" "b 1b\n" - "206:" // Exit + "200:" // Exit : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, 
minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp index 2b5cdae652..957754ad68 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp @@ -36,9 +36,9 @@ namespace arm_gemm { - // Actual kernel implementations void a64_hybrid_fp32_mla_8x4( ARGLIST ); +void a64_hybrid_fp32_mla_8x4_a55( ARGLIST ); class cls_a64_hybrid_fp32_mla_8x4 { @@ -73,9 +73,16 @@ public: // Default to the generic kernel kern_type kernel=a64_hybrid_fp32_mla_8x4; - - cls_a64_hybrid_fp32_mla_8x4(const CPUInfo *) + cls_a64_hybrid_fp32_mla_8x4(const CPUInfo *ci) { + switch(ci->get_cpu_model()) { + default: + break; + case CPUModel::A55r1: + case CPUModel::A53: + 
kernel=a64_hybrid_fp32_mla_8x4_a55; + break; + } } }; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp new file mode 100644 index 0000000000..99920002b2 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp @@ -0,0 +1,2215 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include +#include + +namespace arm_gemm { + +void a64_hybrid_fp32_mla_8x4_a55 ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg A_arg, + size_t M, size_t N, const float *B_ptr, IndirectOutputArg output_arg, + const float *bias, Activation act, bool accumulate +) +{ + struct KernelArgs { + float maxval = static_cast(std::numeric_limits::infinity()); + float minval = - static_cast(std::numeric_limits::infinity()); + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const float *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( + + "1:" // Row loop + "cmp %x[M], #0x8\n" + "bge 148f\n" + "cmp %x[M], #0x6\n" + "bgt 127f\n" + "beq 106f\n" + "cmp %x[M], #0x4\n" + "bgt 85f\n" + "beq 64f\n" + "cmp %x[M], #0x2\n" + "bgt 43f\n" + "beq 22f\n" + "ldr 
x17, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x15, %x[bias]\n" + "mov x14, %x[output_ptr]\n" + "2:" // Height 1: Column loop + "cbz x15, 3f\n" + "ldr q24, [x15, #0x0]\n" + "add x15, x15, #0x10\n" + "b 8f\n" + "3:" // Height 1: no bias + "tbz %x[flags], #0, 7f\n" + "cmp x17, #0x4\n" + "bge 6f\n" + "tbz x17, #1, 4f\n" + "ldr d24, [x14], #0x8\n" + "mov x19, #0x8\n" + "tbz x17, #0, 5f\n" + "ld1 { v24.s }[2], [x14]\n" + "b 5f\n" + "4:" // Height 1: Partial accumulate: partial_1_0 + "ldr s24, [x14, #0x0]\n" + "mov x19, #0x0\n" + "5:" // Height 1: Partial accumulate: Done + "sub x14, x14, x19\n" + "b 8f\n" + "6:" // Height 1: full accumulate + "ldr q24, [x14, #0x0]\n" + "b 8f\n" + "7:" // Height 1: no accumulate + "movi v24.16b, #0x0\n" + "8:" // Height 1: setup done + "mov x13, #0x0\n" + "9:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w12, [x20, x13, LSL #0x2]\n" + "tbz %x[flags], #3, 10f\n" + "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x11, [x20, #0x0]\n" + "cbnz x13, 11f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x11, x11, x19, LSL #2\n" + "b 11f\n" + "10:" // Height 1: setup direct input + "mov x11, %x[input_ptr]\n" + "11:" // Height 1: input setup done + "cmp x12, #0x4\n" + "blt 14f\n" + "ldr q0, [x11, #0x0]\n" + "ldr q8, [x16, #0x0]\n" + "cmp x12, #0x8\n" + "blt 13f\n" + "12:" // Height 1: Multiply loop: Main loop head + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr d9, [x16, #0x10]\n" + "ldr x19, [x16, #0x18]\n" + "add x11, x11, #0x10\n" + "ldr d10, [x16, #0x20]\n" + "sub x12, x12, #0x4\n" + "ldr x21, [x16, #0x28]\n" + "cmp x12, #0x8\n" + "mov v9.d[1], x19\n" + "ldr d11, [x16, #0x30]\n" + "ldr x19, [x16, #0x38]\n" + "add x16, x16, #0x40\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "mov v10.d[1], x21\n" + "prfm pldl1keep, [x11, #0x80]\n" + "mov v11.d[1], x19\n" + 
"ldr d8, [x16, #0x0]\n" + "ldr x26, [x16, #0x8]\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "ldr x10, [x11, #0x8]\n" + "mov v8.d[1], x26\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "ldr d0, [x11, #0x0]\n" + "mov v0.d[1], x10\n" + "bge 12b\n" + "13:" // Height 1: Multiply loop: Single iteration only + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x16, #0x10]\n" + "ldr q10, [x16, #0x20]\n" + "sub x12, x12, #0x4\n" + "ldr q11, [x16, #0x30]\n" + "add x11, x11, #0x10\n" + "prfm pldl1keep, [x11, #0x80]\n" + "add x16, x16, #0x40\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "14:" // Height 1: Multiply loop: Main loop skip + "cbz x12, 16f\n" + "15:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x11], #0x4\n" + "sub x12, x12, #0x1\n" + "ldr q12, [x16, #0x0]\n" + "add x16, x16, #0x10\n" + "fmla v24.4s, v12.4s, v0.s[0]\n" + "cbnz x12, 15b\n" + "16:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x13, x13, #0x1\n" + "cmp x13, x19\n" + "bne 9b\n" + "prfm pstl1keep, [x14, #0x0]\n" + "tbz %x[flags], #1, 17f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "17:" // Height 1: No activation + "cmp x17, #0x4\n" + "bge 20f\n" + "tbz x17, #1, 18f\n" + "str d24, [x14], #0x8\n" + "tbz x17, #0, 19f\n" + "st1 { v24.s }[2], [x14]\n" + "b 19f\n" + "18:" // Height 1: Partial direct writeback: partial_1_0 + "str s24, [x14, #0x0]\n" + "19:" // Height 1: Partial direct writeback: Done + "b 21f\n" + "20:" // Height 1: Full writeback + "str q24, [x14, #0x0]\n" + "add x14, x14, #0x10\n" + "21:" // Height 1: Writeback done + "subs x17, x17, #0x4\n" + "bgt 2b\n" + "b 170f\n" + "22:" // Height 2 + "ldr x17, [%x[args_ptr], %[offsetof_N]]\n" + "mov x15, %x[bias]\n" + "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n" + 
"mov x14, %x[output_ptr]\n" + "23:" // Height 2: Column loop + "cbz x15, 24f\n" + "ldr q24, [x15, #0x0]\n" + "add x15, x15, #0x10\n" + "mov v25.16b, v24.16b\n" + "b 29f\n" + "24:" // Height 2: no bias + "tbz %x[flags], #0, 28f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x17, #0x4\n" + "add x27, x14, x19, LSL #2\n" + "bge 27f\n" + "tbz x17, #1, 25f\n" + "ldr d24, [x14], #0x8\n" + "ldr d25, [x27], #0x8\n" + "mov x19, #0x8\n" + "tbz x17, #0, 26f\n" + "ld1 { v24.s }[2], [x14]\n" + "ld1 { v25.s }[2], [x27]\n" + "b 26f\n" + "25:" // Height 2: Partial accumulate: partial_1_0 + "ldr s24, [x14, #0x0]\n" + "mov x19, #0x0\n" + "ldr s25, [x27, #0x0]\n" + "26:" // Height 2: Partial accumulate: Done + "sub x14, x14, x19\n" + "b 29f\n" + "27:" // Height 2: full accumulate + "ldr q24, [x14, #0x0]\n" + "ldr q25, [x27, #0x0]\n" + "b 29f\n" + "28:" // Height 2: no accumulate + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "29:" // Height 2: setup done + "mov x13, #0x0\n" + "30:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w12, [x20, x13, LSL #0x2]\n" + "tbz %x[flags], #3, 31f\n" + "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x11, [x20, #0x0]\n" + "ldr x9, [x20, #0x8]\n" + "cbnz x13, 32f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x11, x11, x19, LSL #2\n" + "add x9, x9, x19, LSL #2\n" + "b 32f\n" + "31:" // Height 2: setup direct input + "mov x11, %x[input_ptr]\n" + "add x9, x11, x19, LSL #2\n" + "32:" // Height 2: input setup done + "cmp x12, #0x4\n" + "blt 35f\n" + "ldr q0, [x11, #0x0]\n" + "ldr q1, [x9, #0x0]\n" + "cmp x12, #0x8\n" + "ldr q8, [x16, #0x0]\n" + "blt 34f\n" + "33:" // Height 2: Multiply loop: Main loop head + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr d9, [x16, #0x10]\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr x19, [x16, #0x18]\n" + "ldr d10, [x16, #0x20]\n" + "add x11, x11, 
#0x10\n" + "ldr x21, [x16, #0x28]\n" + "add x9, x9, #0x10\n" + "mov v9.d[1], x19\n" + "ldr d11, [x16, #0x30]\n" + "ldr x19, [x16, #0x38]\n" + "sub x12, x12, #0x4\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "mov v10.d[1], x21\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "prfm pldl1keep, [x11, #0x80]\n" + "mov v11.d[1], x19\n" + "prfm pldl1keep, [x9, #0x80]\n" + "ldr x10, [x11, #0x8]\n" + "cmp x12, #0x8\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "ldr x28, [x9, #0x8]\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "add x16, x16, #0x40\n" + "ldr d8, [x16, #0x0]\n" + "ldr x26, [x16, #0x8]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "ldr d0, [x11, #0x0]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "ldr d1, [x9, #0x0]\n" + "mov v8.d[1], x26\n" + "mov v0.d[1], x10\n" + "mov v1.d[1], x28\n" + "bge 33b\n" + "34:" // Height 2: Multiply loop: Single iteration only + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x16, #0x10]\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr q10, [x16, #0x20]\n" + "ldr q11, [x16, #0x30]\n" + "sub x12, x12, #0x4\n" + "add x11, x11, #0x10\n" + "add x9, x9, #0x10\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "prfm pldl1keep, [x11, #0x80]\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "add x16, x16, #0x40\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "35:" // Height 2: Multiply loop: Main loop skip + "cbz x12, 37f\n" + "36:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x11], #0x4\n" + "sub x12, x12, #0x1\n" + "ldr s1, [x9], #0x4\n" + "ldr q12, [x16, #0x0]\n" + "add x16, x16, #0x10\n" + "fmla v24.4s, v12.4s, v0.s[0]\n" + "fmla v25.4s, v12.4s, v1.s[0]\n" + "cbnz x12, 36b\n" + "37:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x13, x13, #0x1\n" + "cmp x13, x19\n" + "bne 30b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x14, #0x0]\n" + "add x27, x14, x19, LSL #2\n" + 
"prfm pstl1keep, [x27, #0x0]\n" + "tbz %x[flags], #1, 38f\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v17.4s }, [x20]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "38:" // Height 2: No activation + "cmp x17, #0x4\n" + "bge 41f\n" + "tbz x17, #1, 39f\n" + "str d24, [x14], #0x8\n" + "str d25, [x27], #0x8\n" + "tbz x17, #0, 40f\n" + "st1 { v24.s }[2], [x14]\n" + "st1 { v25.s }[2], [x27]\n" + "b 40f\n" + "39:" // Height 2: Partial direct writeback: partial_1_0 + "str s24, [x14, #0x0]\n" + "str s25, [x27, #0x0]\n" + "40:" // Height 2: Partial direct writeback: Done + "b 42f\n" + "41:" // Height 2: Full writeback + "str q24, [x14, #0x0]\n" + "add x14, x14, #0x10\n" + "str q25, [x27, #0x0]\n" + "42:" // Height 2: Writeback done + "subs x17, x17, #0x4\n" + "bgt 23b\n" + "b 170f\n" + "43:" // Height 3 + "ldr x17, [%x[args_ptr], %[offsetof_N]]\n" + "mov x15, %x[bias]\n" + "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x14, %x[output_ptr]\n" + "44:" // Height 3: Column loop + "cbz x15, 45f\n" + "ldr q24, [x15, #0x0]\n" + "add x15, x15, #0x10\n" + "mov v25.16b, v24.16b\n" + "mov v26.16b, v24.16b\n" + "b 50f\n" + "45:" // Height 3: no bias + "tbz %x[flags], #0, 49f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x17, #0x4\n" + "add x27, x14, x19, LSL #2\n" + "add x26, x27, x19, LSL #2\n" + "bge 48f\n" + "tbz x17, #1, 46f\n" + "ldr d24, [x14], #0x8\n" + "ldr d25, [x27], #0x8\n" + "mov x19, #0x8\n" + "ldr d26, [x26], #0x8\n" + "tbz x17, #0, 47f\n" + "ld1 { v24.s }[2], [x14]\n" + "ld1 { v25.s }[2], [x27]\n" + "ld1 { v26.s }[2], [x26]\n" + "b 47f\n" + "46:" // Height 3: Partial accumulate: partial_1_0 + "ldr s24, [x14, #0x0]\n" + "mov x19, #0x0\n" + "ldr s25, [x27, #0x0]\n" + "ldr s26, [x26, #0x0]\n" + "47:" // Height 3: Partial accumulate: Done + "sub x14, x14, x19\n" + "b 
50f\n" + "48:" // Height 3: full accumulate + "ldr q24, [x14, #0x0]\n" + "ldr q25, [x27, #0x0]\n" + "ldr q26, [x26, #0x0]\n" + "b 50f\n" + "49:" // Height 3: no accumulate + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "50:" // Height 3: setup done + "mov x13, #0x0\n" + "51:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w12, [x20, x13, LSL #0x2]\n" + "tbz %x[flags], #3, 52f\n" + "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x11, [x20, #0x0]\n" + "ldr x9, [x20, #0x8]\n" + "ldr x27, [x20, #0x10]\n" + "cbnz x13, 53f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x11, x11, x19, LSL #2\n" + "add x9, x9, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" + "b 53f\n" + "52:" // Height 3: setup direct input + "mov x11, %x[input_ptr]\n" + "add x9, x11, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "53:" // Height 3: input setup done + "cmp x12, #0x4\n" + "blt 56f\n" + "ldr q0, [x11, #0x0]\n" + "ldr q1, [x9, #0x0]\n" + "cmp x12, #0x8\n" + "ldr q2, [x27, #0x0]\n" + "ldr q8, [x16, #0x0]\n" + "blt 55f\n" + "54:" // Height 3: Multiply loop: Main loop head + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr d9, [x16, #0x10]\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr x19, [x16, #0x18]\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr d10, [x16, #0x20]\n" + "ldr x21, [x16, #0x28]\n" + "add x11, x11, #0x10\n" + "mov v9.d[1], x19\n" + "ldr d11, [x16, #0x30]\n" + "ldr x19, [x16, #0x38]\n" + "add x9, x9, #0x10\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "mov v10.d[1], x21\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "prfm pldl1keep, [x11, #0x80]\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "mov v11.d[1], x19\n" + "prfm pldl1keep, [x9, #0x80]\n" + "add x27, x27, #0x10\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "ldr x10, [x11, #0x8]\n" + "fmla v26.4s, v10.4s, 
v2.s[2]\n" + "ldr x28, [x9, #0x8]\n" + "ldr x26, [x27, #0x8]\n" + "sub x12, x12, #0x4\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "ldr d0, [x11, #0x0]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "ldr d1, [x9, #0x0]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "ldr d2, [x27, #0x0]\n" + "mov v0.d[1], x10\n" + "cmp x12, #0x8\n" + "mov v1.d[1], x28\n" + "add x16, x16, #0x40\n" + "mov v2.d[1], x26\n" + "ldr d8, [x16, #0x0]\n" + "ldr x26, [x16, #0x8]\n" + "mov v8.d[1], x26\n" + "bge 54b\n" + "55:" // Height 3: Multiply loop: Single iteration only + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x16, #0x10]\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr q10, [x16, #0x20]\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr q11, [x16, #0x30]\n" + "sub x12, x12, #0x4\n" + "add x11, x11, #0x10\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "prfm pldl1keep, [x11, #0x80]\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "add x9, x9, #0x10\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "add x27, x27, #0x10\n" + "add x16, x16, #0x40\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "56:" // Height 3: Multiply loop: Main loop skip + "cbz x12, 58f\n" + "57:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x11], #0x4\n" + "sub x12, x12, #0x1\n" + "ldr s1, [x9], #0x4\n" + "ldr s2, [x27], #0x4\n" + "ldr q12, [x16, #0x0]\n" + "add x16, x16, #0x10\n" + "fmla v24.4s, v12.4s, v0.s[0]\n" + "fmla v25.4s, v12.4s, v1.s[0]\n" + "fmla v26.4s, v12.4s, v2.s[0]\n" + "cbnz x12, 57b\n" + "58:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x13, x13, #0x1\n" + "cmp x13, x19\n" + "bne 51b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x14, #0x0]\n" + "add x27, x14, x19, LSL #2\n" + "prfm pstl1keep, [x27, #0x0]\n" + "add x26, 
x27, x19, LSL #2\n" + "prfm pstl1keep, [x26, #0x0]\n" + "tbz %x[flags], #1, 59f\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v17.4s }, [x20]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "59:" // Height 3: No activation + "cmp x17, #0x4\n" + "bge 62f\n" + "tbz x17, #1, 60f\n" + "str d24, [x14], #0x8\n" + "str d25, [x27], #0x8\n" + "str d26, [x26], #0x8\n" + "tbz x17, #0, 61f\n" + "st1 { v24.s }[2], [x14]\n" + "st1 { v25.s }[2], [x27]\n" + "st1 { v26.s }[2], [x26]\n" + "b 61f\n" + "60:" // Height 3: Partial direct writeback: partial_1_0 + "str s24, [x14, #0x0]\n" + "str s25, [x27, #0x0]\n" + "str s26, [x26, #0x0]\n" + "61:" // Height 3: Partial direct writeback: Done + "b 63f\n" + "62:" // Height 3: Full writeback + "str q24, [x14, #0x0]\n" + "add x14, x14, #0x10\n" + "str q25, [x27, #0x0]\n" + "str q26, [x26, #0x0]\n" + "63:" // Height 3: Writeback done + "subs x17, x17, #0x4\n" + "bgt 44b\n" + "b 170f\n" + "64:" // Height 4 + "ldr x17, [%x[args_ptr], %[offsetof_N]]\n" + "mov x15, %x[bias]\n" + "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x14, %x[output_ptr]\n" + "65:" // Height 4: Column loop + "cbz x15, 66f\n" + "ldr q24, [x15, #0x0]\n" + "add x15, x15, #0x10\n" + "mov v25.16b, v24.16b\n" + "mov v26.16b, v24.16b\n" + "mov v27.16b, v24.16b\n" + "b 71f\n" + "66:" // Height 4: no bias + "tbz %x[flags], #0, 70f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x17, #0x4\n" + "add x27, x14, x19, LSL #2\n" + "add x26, x27, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "bge 69f\n" + "tbz x17, #1, 67f\n" + "ldr d24, [x14], #0x8\n" + "ldr d25, [x27], #0x8\n" + "mov x19, #0x8\n" + "ldr d26, [x26], #0x8\n" + "ldr d27, [x25], #0x8\n" + "tbz x17, #0, 68f\n" + "ld1 { v24.s }[2], [x14]\n" + "ld1 { v25.s 
}[2], [x27]\n" + "ld1 { v26.s }[2], [x26]\n" + "ld1 { v27.s }[2], [x25]\n" + "b 68f\n" + "67:" // Height 4: Partial accumulate: partial_1_0 + "ldr s24, [x14, #0x0]\n" + "mov x19, #0x0\n" + "ldr s25, [x27, #0x0]\n" + "ldr s26, [x26, #0x0]\n" + "ldr s27, [x25, #0x0]\n" + "68:" // Height 4: Partial accumulate: Done + "sub x14, x14, x19\n" + "b 71f\n" + "69:" // Height 4: full accumulate + "ldr q24, [x14, #0x0]\n" + "ldr q25, [x27, #0x0]\n" + "ldr q26, [x26, #0x0]\n" + "ldr q27, [x25, #0x0]\n" + "b 71f\n" + "70:" // Height 4: no accumulate + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "71:" // Height 4: setup done + "mov x13, #0x0\n" + "72:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w12, [x20, x13, LSL #0x2]\n" + "tbz %x[flags], #3, 73f\n" + "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x11, [x20, #0x0]\n" + "ldr x9, [x20, #0x8]\n" + "ldr x27, [x20, #0x10]\n" + "ldr x25, [x20, #0x18]\n" + "cbnz x13, 74f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x11, x11, x19, LSL #2\n" + "add x9, x9, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "b 74f\n" + "73:" // Height 4: setup direct input + "mov x11, %x[input_ptr]\n" + "add x9, x11, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "74:" // Height 4: input setup done + "cmp x12, #0x4\n" + "blt 77f\n" + "ldr q0, [x11, #0x0]\n" + "ldr q1, [x9, #0x0]\n" + "cmp x12, #0x8\n" + "ldr q2, [x27, #0x0]\n" + "ldr q3, [x25, #0x0]\n" + "ldr q8, [x16, #0x0]\n" + "blt 76f\n" + "75:" // Height 4: Multiply loop: Main loop head + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr d9, [x16, #0x10]\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr x19, [x16, #0x18]\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr d10, [x16, #0x20]\n" + "fmla v27.4s, v8.4s, v3.s[0]\n" + "ldr x21, [x16, 
#0x28]\n" + "mov v9.d[1], x19\n" + "ldr d11, [x16, #0x30]\n" + "ldr x19, [x16, #0x38]\n" + "add x11, x11, #0x10\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "mov v10.d[1], x21\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "prfm pldl1keep, [x11, #0x80]\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "mov v11.d[1], x19\n" + "fmla v27.4s, v9.4s, v3.s[1]\n" + "ldr x10, [x11, #0x8]\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "add x9, x9, #0x10\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "ldr x28, [x9, #0x8]\n" + "fmla v27.4s, v10.4s, v3.s[2]\n" + "add x27, x27, #0x10\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "ldr d0, [x11, #0x0]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "ldr d1, [x9, #0x0]\n" + "fmla v27.4s, v11.4s, v3.s[3]\n" + "ldr d2, [x27, #0x0]\n" + "mov v0.d[1], x10\n" + "ldr x26, [x27, #0x8]\n" + "mov v1.d[1], x28\n" + "add x25, x25, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "sub x12, x12, #0x4\n" + "mov v2.d[1], x26\n" + "ldr d3, [x25, #0x0]\n" + "ldr x19, [x25, #0x8]\n" + "cmp x12, #0x8\n" + "add x16, x16, #0x40\n" + "ldr d8, [x16, #0x0]\n" + "mov v3.d[1], x19\n" + "ldr x26, [x16, #0x8]\n" + "mov v8.d[1], x26\n" + "bge 75b\n" + "76:" // Height 4: Multiply loop: Single iteration only + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x16, #0x10]\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr q10, [x16, #0x20]\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr q11, [x16, #0x30]\n" + "fmla v27.4s, v8.4s, v3.s[0]\n" + "sub x12, x12, #0x4\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "add x11, x11, #0x10\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "prfm pldl1keep, [x11, #0x80]\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "add x9, x9, #0x10\n" + "fmla v27.4s, v9.4s, v3.s[1]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "add x27, x27, #0x10\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "add x25, x25, 
#0x10\n" + "fmla v27.4s, v10.4s, v3.s[2]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "add x16, x16, #0x40\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "fmla v27.4s, v11.4s, v3.s[3]\n" + "77:" // Height 4: Multiply loop: Main loop skip + "cbz x12, 79f\n" + "78:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x11], #0x4\n" + "sub x12, x12, #0x1\n" + "ldr s1, [x9], #0x4\n" + "ldr s2, [x27], #0x4\n" + "ldr s3, [x25], #0x4\n" + "ldr q12, [x16, #0x0]\n" + "add x16, x16, #0x10\n" + "fmla v24.4s, v12.4s, v0.s[0]\n" + "fmla v25.4s, v12.4s, v1.s[0]\n" + "fmla v26.4s, v12.4s, v2.s[0]\n" + "fmla v27.4s, v12.4s, v3.s[0]\n" + "cbnz x12, 78b\n" + "79:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x13, x13, #0x1\n" + "cmp x13, x19\n" + "bne 72b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x14, #0x0]\n" + "add x27, x14, x19, LSL #2\n" + "prfm pstl1keep, [x27, #0x0]\n" + "add x26, x27, x19, LSL #2\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x25, x26, x19, LSL #2\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbz %x[flags], #1, 80f\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v17.4s }, [x20]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmin v27.4s, v27.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "fmax v27.4s, v27.4s, v17.4s\n" + "80:" // Height 4: No activation + "cmp x17, #0x4\n" + "bge 83f\n" + "tbz x17, #1, 81f\n" + "str d24, [x14], #0x8\n" + "str d25, [x27], #0x8\n" + "str d26, [x26], #0x8\n" + "str d27, [x25], #0x8\n" + "tbz x17, #0, 82f\n" + "st1 { v24.s }[2], [x14]\n" + "st1 { v25.s }[2], [x27]\n" + "st1 { v26.s }[2], [x26]\n" + "st1 { v27.s }[2], [x25]\n" + "b 82f\n" + "81:" // Height 4: Partial direct 
writeback: partial_1_0 + "str s24, [x14, #0x0]\n" + "str s25, [x27, #0x0]\n" + "str s26, [x26, #0x0]\n" + "str s27, [x25, #0x0]\n" + "82:" // Height 4: Partial direct writeback: Done + "b 84f\n" + "83:" // Height 4: Full writeback + "str q24, [x14, #0x0]\n" + "add x14, x14, #0x10\n" + "str q25, [x27, #0x0]\n" + "str q26, [x26, #0x0]\n" + "str q27, [x25, #0x0]\n" + "84:" // Height 4: Writeback done + "subs x17, x17, #0x4\n" + "bgt 65b\n" + "b 170f\n" + "85:" // Height 5 + "ldr x17, [%x[args_ptr], %[offsetof_N]]\n" + "mov x15, %x[bias]\n" + "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x14, %x[output_ptr]\n" + "86:" // Height 5: Column loop + "cbz x15, 87f\n" + "ldr q24, [x15, #0x0]\n" + "add x15, x15, #0x10\n" + "mov v25.16b, v24.16b\n" + "mov v26.16b, v24.16b\n" + "mov v27.16b, v24.16b\n" + "mov v28.16b, v24.16b\n" + "b 92f\n" + "87:" // Height 5: no bias + "tbz %x[flags], #0, 91f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x17, #0x4\n" + "add x27, x14, x19, LSL #2\n" + "add x26, x27, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "bge 90f\n" + "tbz x17, #1, 88f\n" + "ldr d24, [x14], #0x8\n" + "ldr d25, [x27], #0x8\n" + "mov x19, #0x8\n" + "ldr d26, [x26], #0x8\n" + "ldr d27, [x25], #0x8\n" + "ldr d28, [x24], #0x8\n" + "tbz x17, #0, 89f\n" + "ld1 { v24.s }[2], [x14]\n" + "ld1 { v25.s }[2], [x27]\n" + "ld1 { v26.s }[2], [x26]\n" + "ld1 { v27.s }[2], [x25]\n" + "ld1 { v28.s }[2], [x24]\n" + "b 89f\n" + "88:" // Height 5: Partial accumulate: partial_1_0 + "ldr s24, [x14, #0x0]\n" + "mov x19, #0x0\n" + "ldr s25, [x27, #0x0]\n" + "ldr s26, [x26, #0x0]\n" + "ldr s27, [x25, #0x0]\n" + "ldr s28, [x24, #0x0]\n" + "89:" // Height 5: Partial accumulate: Done + "sub x14, x14, x19\n" + "b 92f\n" + "90:" // Height 5: full accumulate + "ldr q24, [x14, #0x0]\n" + "ldr q25, [x27, #0x0]\n" + "ldr q26, [x26, #0x0]\n" + "ldr q27, [x25, #0x0]\n" + "ldr q28, [x24, #0x0]\n" + "b 92f\n" + "91:" // Height 5: no accumulate + 
"movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "92:" // Height 5: setup done + "mov x13, #0x0\n" + "93:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w12, [x20, x13, LSL #0x2]\n" + "tbz %x[flags], #3, 94f\n" + "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x11, [x20, #0x0]\n" + "ldr x9, [x20, #0x8]\n" + "ldr x27, [x20, #0x10]\n" + "ldr x25, [x20, #0x18]\n" + "ldr x24, [x20, #0x20]\n" + "cbnz x13, 95f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x11, x11, x19, LSL #2\n" + "add x9, x9, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "b 95f\n" + "94:" // Height 5: setup direct input + "mov x11, %x[input_ptr]\n" + "add x9, x11, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "95:" // Height 5: input setup done + "cmp x12, #0x4\n" + "blt 98f\n" + "ldr q0, [x11, #0x0]\n" + "ldr q1, [x9, #0x0]\n" + "cmp x12, #0x8\n" + "ldr q2, [x27, #0x0]\n" + "ldr q3, [x25, #0x0]\n" + "ldr q4, [x24, #0x0]\n" + "ldr q8, [x16, #0x0]\n" + "blt 97f\n" + "96:" // Height 5: Multiply loop: Main loop head + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr d9, [x16, #0x10]\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr x19, [x16, #0x18]\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr d10, [x16, #0x20]\n" + "fmla v27.4s, v8.4s, v3.s[0]\n" + "ldr x21, [x16, #0x28]\n" + "fmla v28.4s, v8.4s, v4.s[0]\n" + "mov v9.d[1], x19\n" + "ldr d11, [x16, #0x30]\n" + "add x11, x11, #0x10\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "mov v10.d[1], x21\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "ldr x19, [x16, #0x38]\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "prfm pldl1keep, [x11, #0x80]\n" + "fmla v27.4s, v9.4s, v3.s[1]\n" + "ldr x10, [x11, #0x8]\n" + "fmla v28.4s, v9.4s, 
v4.s[1]\n" + "mov v11.d[1], x19\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "add x9, x9, #0x10\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "ldr x28, [x9, #0x8]\n" + "fmla v27.4s, v10.4s, v3.s[2]\n" + "add x27, x27, #0x10\n" + "fmla v28.4s, v10.4s, v4.s[2]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "ldr d0, [x11, #0x0]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "ldr d1, [x9, #0x0]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "ldr d2, [x27, #0x0]\n" + "fmla v27.4s, v11.4s, v3.s[3]\n" + "mov v0.d[1], x10\n" + "fmla v28.4s, v11.4s, v4.s[3]\n" + "mov v1.d[1], x28\n" + "ldr x26, [x27, #0x8]\n" + "add x25, x25, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sub x12, x12, #0x4\n" + "mov v2.d[1], x26\n" + "ldr d3, [x25, #0x0]\n" + "ldr x19, [x25, #0x8]\n" + "cmp x12, #0x8\n" + "ldr d4, [x24, #0x0]\n" + "add x16, x16, #0x40\n" + "ldr x21, [x24, #0x8]\n" + "mov v3.d[1], x19\n" + "ldr d8, [x16, #0x0]\n" + "ldr x26, [x16, #0x8]\n" + "mov v4.d[1], x21\n" + "mov v8.d[1], x26\n" + "bge 96b\n" + "97:" // Height 5: Multiply loop: Single iteration only + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x16, #0x10]\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr q10, [x16, #0x20]\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr q11, [x16, #0x30]\n" + "fmla v27.4s, v8.4s, v3.s[0]\n" + "sub x12, x12, #0x4\n" + "fmla v28.4s, v8.4s, v4.s[0]\n" + "add x11, x11, #0x10\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "prfm pldl1keep, [x11, #0x80]\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "add x9, x9, #0x10\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "fmla v27.4s, v9.4s, v3.s[1]\n" + "add x27, x27, #0x10\n" + "fmla v28.4s, v9.4s, v4.s[1]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "add x25, x25, #0x10\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "add 
x24, x24, #0x10\n" + "fmla v27.4s, v10.4s, v3.s[2]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v28.4s, v10.4s, v4.s[2]\n" + "add x16, x16, #0x40\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "fmla v27.4s, v11.4s, v3.s[3]\n" + "fmla v28.4s, v11.4s, v4.s[3]\n" + "98:" // Height 5: Multiply loop: Main loop skip + "cbz x12, 100f\n" + "99:" // Height 5: Multiply loop: Odd block loop + "ldr s0, [x11], #0x4\n" + "sub x12, x12, #0x1\n" + "ldr s1, [x9], #0x4\n" + "ldr s2, [x27], #0x4\n" + "ldr s3, [x25], #0x4\n" + "ldr s4, [x24], #0x4\n" + "ldr q12, [x16, #0x0]\n" + "add x16, x16, #0x10\n" + "fmla v24.4s, v12.4s, v0.s[0]\n" + "fmla v25.4s, v12.4s, v1.s[0]\n" + "fmla v26.4s, v12.4s, v2.s[0]\n" + "fmla v27.4s, v12.4s, v3.s[0]\n" + "fmla v28.4s, v12.4s, v4.s[0]\n" + "cbnz x12, 99b\n" + "100:" // Height 5: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x13, x13, #0x1\n" + "cmp x13, x19\n" + "bne 93b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x14, #0x0]\n" + "add x27, x14, x19, LSL #2\n" + "prfm pstl1keep, [x27, #0x0]\n" + "add x26, x27, x19, LSL #2\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x25, x26, x19, LSL #2\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add x24, x25, x19, LSL #2\n" + "prfm pstl1keep, [x24, #0x0]\n" + "tbz %x[flags], #1, 101f\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v17.4s }, [x20]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmin v27.4s, v27.4s, v16.4s\n" + "fmin v28.4s, v28.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "fmax v27.4s, v27.4s, v17.4s\n" + "fmax v28.4s, v28.4s, v17.4s\n" + "101:" // Height 5: No activation + "cmp x17, #0x4\n" + "bge 104f\n" + "tbz x17, #1, 102f\n" + "str d24, [x14], 
#0x8\n" + "str d25, [x27], #0x8\n" + "str d26, [x26], #0x8\n" + "str d27, [x25], #0x8\n" + "str d28, [x24], #0x8\n" + "tbz x17, #0, 103f\n" + "st1 { v24.s }[2], [x14]\n" + "st1 { v25.s }[2], [x27]\n" + "st1 { v26.s }[2], [x26]\n" + "st1 { v27.s }[2], [x25]\n" + "st1 { v28.s }[2], [x24]\n" + "b 103f\n" + "102:" // Height 5: Partial direct writeback: partial_1_0 + "str s24, [x14, #0x0]\n" + "str s25, [x27, #0x0]\n" + "str s26, [x26, #0x0]\n" + "str s27, [x25, #0x0]\n" + "str s28, [x24, #0x0]\n" + "103:" // Height 5: Partial direct writeback: Done + "b 105f\n" + "104:" // Height 5: Full writeback + "str q24, [x14, #0x0]\n" + "add x14, x14, #0x10\n" + "str q25, [x27, #0x0]\n" + "str q26, [x26, #0x0]\n" + "str q27, [x25, #0x0]\n" + "str q28, [x24, #0x0]\n" + "105:" // Height 5: Writeback done + "subs x17, x17, #0x4\n" + "bgt 86b\n" + "b 170f\n" + "106:" // Height 6 + "ldr x17, [%x[args_ptr], %[offsetof_N]]\n" + "mov x15, %x[bias]\n" + "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x14, %x[output_ptr]\n" + "107:" // Height 6: Column loop + "cbz x15, 108f\n" + "ldr q24, [x15, #0x0]\n" + "add x15, x15, #0x10\n" + "mov v25.16b, v24.16b\n" + "mov v26.16b, v24.16b\n" + "mov v27.16b, v24.16b\n" + "mov v28.16b, v24.16b\n" + "mov v29.16b, v24.16b\n" + "b 113f\n" + "108:" // Height 6: no bias + "tbz %x[flags], #0, 112f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x17, #0x4\n" + "add x27, x14, x19, LSL #2\n" + "add x26, x27, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "bge 111f\n" + "tbz x17, #1, 109f\n" + "ldr d24, [x14], #0x8\n" + "ldr d25, [x27], #0x8\n" + "mov x19, #0x8\n" + "ldr d26, [x26], #0x8\n" + "ldr d27, [x25], #0x8\n" + "ldr d28, [x24], #0x8\n" + "ldr d29, [x23], #0x8\n" + "tbz x17, #0, 110f\n" + "ld1 { v24.s }[2], [x14]\n" + "ld1 { v25.s }[2], [x27]\n" + "ld1 { v26.s }[2], [x26]\n" + "ld1 { v27.s }[2], [x25]\n" + "ld1 { v28.s }[2], [x24]\n" + "ld1 { v29.s }[2], [x23]\n" + 
"b 110f\n" + "109:" // Height 6: Partial accumulate: partial_1_0 + "ldr s24, [x14, #0x0]\n" + "mov x19, #0x0\n" + "ldr s25, [x27, #0x0]\n" + "ldr s26, [x26, #0x0]\n" + "ldr s27, [x25, #0x0]\n" + "ldr s28, [x24, #0x0]\n" + "ldr s29, [x23, #0x0]\n" + "110:" // Height 6: Partial accumulate: Done + "sub x14, x14, x19\n" + "b 113f\n" + "111:" // Height 6: full accumulate + "ldr q24, [x14, #0x0]\n" + "ldr q25, [x27, #0x0]\n" + "ldr q26, [x26, #0x0]\n" + "ldr q27, [x25, #0x0]\n" + "ldr q28, [x24, #0x0]\n" + "ldr q29, [x23, #0x0]\n" + "b 113f\n" + "112:" // Height 6: no accumulate + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "113:" // Height 6: setup done + "mov x13, #0x0\n" + "114:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w12, [x20, x13, LSL #0x2]\n" + "tbz %x[flags], #3, 115f\n" + "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x11, [x20, #0x0]\n" + "ldr x9, [x20, #0x8]\n" + "ldr x27, [x20, #0x10]\n" + "ldr x25, [x20, #0x18]\n" + "ldr x24, [x20, #0x20]\n" + "ldr x23, [x20, #0x28]\n" + "cbnz x13, 116f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x11, x11, x19, LSL #2\n" + "add x9, x9, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "b 116f\n" + "115:" // Height 6: setup direct input + "mov x11, %x[input_ptr]\n" + "add x9, x11, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "116:" // Height 6: input setup done + "cmp x12, #0x4\n" + "blt 119f\n" + "ldr q0, [x11, #0x0]\n" + "ldr q1, [x9, #0x0]\n" + "cmp x12, #0x8\n" + "ldr q2, [x27, #0x0]\n" + "ldr q3, [x25, #0x0]\n" + "ldr q4, [x24, #0x0]\n" + "ldr q5, [x23, 
#0x0]\n" + "ldr q8, [x16, #0x0]\n" + "blt 118f\n" + "117:" // Height 6: Multiply loop: Main loop head + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr d9, [x16, #0x10]\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr x19, [x16, #0x18]\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr d10, [x16, #0x20]\n" + "fmla v27.4s, v8.4s, v3.s[0]\n" + "ldr x21, [x16, #0x28]\n" + "fmla v28.4s, v8.4s, v4.s[0]\n" + "mov v9.d[1], x19\n" + "fmla v29.4s, v8.4s, v5.s[0]\n" + "ldr d11, [x16, #0x30]\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "mov v10.d[1], x21\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "ldr x19, [x16, #0x38]\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "add x11, x11, #0x10\n" + "fmla v27.4s, v9.4s, v3.s[1]\n" + "prfm pldl1keep, [x11, #0x80]\n" + "fmla v28.4s, v9.4s, v4.s[1]\n" + "mov v11.d[1], x19\n" + "fmla v29.4s, v9.4s, v5.s[1]\n" + "ldr x10, [x11, #0x8]\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "add x9, x9, #0x10\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "ldr x28, [x9, #0x8]\n" + "fmla v27.4s, v10.4s, v3.s[2]\n" + "add x27, x27, #0x10\n" + "fmla v28.4s, v10.4s, v4.s[2]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "fmla v29.4s, v10.4s, v5.s[2]\n" + "ldr x26, [x27, #0x8]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "ldr d0, [x11, #0x0]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "ldr d1, [x9, #0x0]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "ldr d2, [x27, #0x0]\n" + "fmla v27.4s, v11.4s, v3.s[3]\n" + "mov v0.d[1], x10\n" + "fmla v28.4s, v11.4s, v4.s[3]\n" + "mov v1.d[1], x28\n" + "fmla v29.4s, v11.4s, v5.s[3]\n" + "mov v2.d[1], x26\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x23, x23, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sub x12, x12, #0x4\n" + "prfm pldl1keep, [x23, #0x80]\n" + "cmp x12, #0x8\n" + "ldr d3, [x25, #0x0]\n" + "add x16, x16, #0x40\n" + "ldr x19, [x25, #0x8]\n" + "ldr d4, [x24, #0x0]\n" + "ldr x21, [x24, #0x8]\n" + "mov v3.d[1], x19\n" + "ldr d5, [x23, #0x0]\n" + "ldr x19, 
[x23, #0x8]\n" + "mov v4.d[1], x21\n" + "ldr d8, [x16, #0x0]\n" + "ldr x26, [x16, #0x8]\n" + "mov v5.d[1], x19\n" + "mov v8.d[1], x26\n" + "bge 117b\n" + "118:" // Height 6: Multiply loop: Single iteration only + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x16, #0x10]\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr q10, [x16, #0x20]\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr q11, [x16, #0x30]\n" + "fmla v27.4s, v8.4s, v3.s[0]\n" + "sub x12, x12, #0x4\n" + "fmla v28.4s, v8.4s, v4.s[0]\n" + "add x11, x11, #0x10\n" + "fmla v29.4s, v8.4s, v5.s[0]\n" + "prfm pldl1keep, [x11, #0x80]\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "add x9, x9, #0x10\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "add x27, x27, #0x10\n" + "fmla v27.4s, v9.4s, v3.s[1]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "fmla v28.4s, v9.4s, v4.s[1]\n" + "add x25, x25, #0x10\n" + "fmla v29.4s, v9.4s, v5.s[1]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "add x24, x24, #0x10\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "add x23, x23, #0x10\n" + "fmla v27.4s, v10.4s, v3.s[2]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "fmla v28.4s, v10.4s, v4.s[2]\n" + "add x16, x16, #0x40\n" + "fmla v29.4s, v10.4s, v5.s[2]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "fmla v27.4s, v11.4s, v3.s[3]\n" + "fmla v28.4s, v11.4s, v4.s[3]\n" + "fmla v29.4s, v11.4s, v5.s[3]\n" + "119:" // Height 6: Multiply loop: Main loop skip + "cbz x12, 121f\n" + "120:" // Height 6: Multiply loop: Odd block loop + "ldr s0, [x11], #0x4\n" + "sub x12, x12, #0x1\n" + "ldr s1, [x9], #0x4\n" + "ldr s2, [x27], #0x4\n" + "ldr s3, [x25], #0x4\n" + "ldr s4, [x24], #0x4\n" + "ldr s5, [x23], #0x4\n" + "ldr q12, [x16, #0x0]\n" + "add x16, x16, #0x10\n" + "fmla v24.4s, v12.4s, v0.s[0]\n" + "fmla v25.4s, v12.4s, v1.s[0]\n" + "fmla v26.4s, v12.4s, v2.s[0]\n" + 
"fmla v27.4s, v12.4s, v3.s[0]\n" + "fmla v28.4s, v12.4s, v4.s[0]\n" + "fmla v29.4s, v12.4s, v5.s[0]\n" + "cbnz x12, 120b\n" + "121:" // Height 6: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x13, x13, #0x1\n" + "cmp x13, x19\n" + "bne 114b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x14, #0x0]\n" + "add x27, x14, x19, LSL #2\n" + "prfm pstl1keep, [x27, #0x0]\n" + "add x26, x27, x19, LSL #2\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x25, x26, x19, LSL #2\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add x24, x25, x19, LSL #2\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbz %x[flags], #1, 122f\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v17.4s }, [x20]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmin v27.4s, v27.4s, v16.4s\n" + "fmin v28.4s, v28.4s, v16.4s\n" + "fmin v29.4s, v29.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "fmax v27.4s, v27.4s, v17.4s\n" + "fmax v28.4s, v28.4s, v17.4s\n" + "fmax v29.4s, v29.4s, v17.4s\n" + "122:" // Height 6: No activation + "cmp x17, #0x4\n" + "bge 125f\n" + "tbz x17, #1, 123f\n" + "str d24, [x14], #0x8\n" + "str d25, [x27], #0x8\n" + "str d26, [x26], #0x8\n" + "str d27, [x25], #0x8\n" + "str d28, [x24], #0x8\n" + "str d29, [x23], #0x8\n" + "tbz x17, #0, 124f\n" + "st1 { v24.s }[2], [x14]\n" + "st1 { v25.s }[2], [x27]\n" + "st1 { v26.s }[2], [x26]\n" + "st1 { v27.s }[2], [x25]\n" + "st1 { v28.s }[2], [x24]\n" + "st1 { v29.s }[2], [x23]\n" + "b 124f\n" + "123:" // Height 6: Partial direct writeback: partial_1_0 + "str s24, [x14, #0x0]\n" + "str s25, [x27, #0x0]\n" + "str s26, [x26, #0x0]\n" + "str s27, [x25, #0x0]\n" + "str s28, [x24, #0x0]\n" + "str s29, [x23, #0x0]\n" + 
"124:" // Height 6: Partial direct writeback: Done + "b 126f\n" + "125:" // Height 6: Full writeback + "str q24, [x14, #0x0]\n" + "add x14, x14, #0x10\n" + "str q25, [x27, #0x0]\n" + "str q26, [x26, #0x0]\n" + "str q27, [x25, #0x0]\n" + "str q28, [x24, #0x0]\n" + "str q29, [x23, #0x0]\n" + "126:" // Height 6: Writeback done + "subs x17, x17, #0x4\n" + "bgt 107b\n" + "b 170f\n" + "127:" // Height 7 + "ldr x17, [%x[args_ptr], %[offsetof_N]]\n" + "mov x15, %x[bias]\n" + "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x14, %x[output_ptr]\n" + "128:" // Height 7: Column loop + "cbz x15, 129f\n" + "ldr q24, [x15, #0x0]\n" + "add x15, x15, #0x10\n" + "mov v25.16b, v24.16b\n" + "mov v26.16b, v24.16b\n" + "mov v27.16b, v24.16b\n" + "mov v28.16b, v24.16b\n" + "mov v29.16b, v24.16b\n" + "mov v30.16b, v24.16b\n" + "b 134f\n" + "129:" // Height 7: no bias + "tbz %x[flags], #0, 133f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x17, #0x4\n" + "add x27, x14, x19, LSL #2\n" + "add x26, x27, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "bge 132f\n" + "tbz x17, #1, 130f\n" + "ldr d24, [x14], #0x8\n" + "ldr d25, [x27], #0x8\n" + "mov x19, #0x8\n" + "ldr d26, [x26], #0x8\n" + "ldr d27, [x25], #0x8\n" + "ldr d28, [x24], #0x8\n" + "ldr d29, [x23], #0x8\n" + "ldr d30, [x22], #0x8\n" + "tbz x17, #0, 131f\n" + "ld1 { v24.s }[2], [x14]\n" + "ld1 { v25.s }[2], [x27]\n" + "ld1 { v26.s }[2], [x26]\n" + "ld1 { v27.s }[2], [x25]\n" + "ld1 { v28.s }[2], [x24]\n" + "ld1 { v29.s }[2], [x23]\n" + "ld1 { v30.s }[2], [x22]\n" + "b 131f\n" + "130:" // Height 7: Partial accumulate: partial_1_0 + "ldr s24, [x14, #0x0]\n" + "mov x19, #0x0\n" + "ldr s25, [x27, #0x0]\n" + "ldr s26, [x26, #0x0]\n" + "ldr s27, [x25, #0x0]\n" + "ldr s28, [x24, #0x0]\n" + "ldr s29, [x23, #0x0]\n" + "ldr s30, [x22, #0x0]\n" + "131:" // Height 7: Partial accumulate: Done + "sub x14, x14, x19\n" + "b 
134f\n" + "132:" // Height 7: full accumulate + "ldr q24, [x14, #0x0]\n" + "ldr q25, [x27, #0x0]\n" + "ldr q26, [x26, #0x0]\n" + "ldr q27, [x25, #0x0]\n" + "ldr q28, [x24, #0x0]\n" + "ldr q29, [x23, #0x0]\n" + "ldr q30, [x22, #0x0]\n" + "b 134f\n" + "133:" // Height 7: no accumulate + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "134:" // Height 7: setup done + "mov x13, #0x0\n" + "135:" // Height 7: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w12, [x20, x13, LSL #0x2]\n" + "tbz %x[flags], #3, 136f\n" + "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x11, [x20, #0x0]\n" + "ldr x9, [x20, #0x8]\n" + "ldr x27, [x20, #0x10]\n" + "ldr x25, [x20, #0x18]\n" + "ldr x24, [x20, #0x20]\n" + "ldr x23, [x20, #0x28]\n" + "ldr x22, [x20, #0x30]\n" + "cbnz x13, 137f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x11, x11, x19, LSL #2\n" + "add x9, x9, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "b 137f\n" + "136:" // Height 7: setup direct input + "mov x11, %x[input_ptr]\n" + "add x9, x11, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "137:" // Height 7: input setup done + "cmp x12, #0x4\n" + "blt 140f\n" + "ldr q0, [x11, #0x0]\n" + "ldr q1, [x9, #0x0]\n" + "cmp x12, #0x8\n" + "ldr q2, [x27, #0x0]\n" + "ldr q3, [x25, #0x0]\n" + "ldr q4, [x24, #0x0]\n" + "ldr q5, [x23, #0x0]\n" + "ldr q6, [x22, #0x0]\n" + "ldr q8, [x16, #0x0]\n" + "blt 139f\n" + "138:" // Height 7: Multiply loop: Main loop head + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr d9, [x16, 
#0x10]\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr x19, [x16, #0x18]\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr d10, [x16, #0x20]\n" + "fmla v27.4s, v8.4s, v3.s[0]\n" + "ldr x21, [x16, #0x28]\n" + "fmla v28.4s, v8.4s, v4.s[0]\n" + "mov v9.d[1], x19\n" + "fmla v29.4s, v8.4s, v5.s[0]\n" + "ldr d11, [x16, #0x30]\n" + "fmla v30.4s, v8.4s, v6.s[0]\n" + "mov v10.d[1], x21\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "ldr x19, [x16, #0x38]\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "add x11, x11, #0x10\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "prfm pldl1keep, [x11, #0x80]\n" + "fmla v27.4s, v9.4s, v3.s[1]\n" + "mov v11.d[1], x19\n" + "fmla v28.4s, v9.4s, v4.s[1]\n" + "ldr x10, [x11, #0x8]\n" + "fmla v29.4s, v9.4s, v5.s[1]\n" + "add x9, x9, #0x10\n" + "fmla v30.4s, v9.4s, v6.s[1]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "ldr x28, [x9, #0x8]\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "add x27, x27, #0x10\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "fmla v27.4s, v10.4s, v3.s[2]\n" + "ldr x26, [x27, #0x8]\n" + "fmla v28.4s, v10.4s, v4.s[2]\n" + "add x25, x25, #0x10\n" + "fmla v29.4s, v10.4s, v5.s[2]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "fmla v30.4s, v10.4s, v6.s[2]\n" + "ldr x19, [x25, #0x8]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "ldr d0, [x11, #0x0]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "ldr d1, [x9, #0x0]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "ldr d2, [x27, #0x0]\n" + "fmla v27.4s, v11.4s, v3.s[3]\n" + "mov v0.d[1], x10\n" + "fmla v28.4s, v11.4s, v4.s[3]\n" + "mov v1.d[1], x28\n" + "fmla v29.4s, v11.4s, v5.s[3]\n" + "mov v2.d[1], x26\n" + "fmla v30.4s, v11.4s, v6.s[3]\n" + "ldr d3, [x25, #0x0]\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x22, x22, #0x10\n" + "mov v3.d[1], x19\n" + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sub x12, x12, #0x4\n" + "ldr d4, [x24, #0x0]\n" + "cmp x12, #0x8\n" + "ldr x21, [x24, #0x8]\n" + "add x16, x16, 
#0x40\n" + "ldr d8, [x16, #0x0]\n" + "ldr x26, [x16, #0x8]\n" + "mov v4.d[1], x21\n" + "ldr d5, [x23, #0x0]\n" + "ldr x19, [x23, #0x8]\n" + "mov v8.d[1], x26\n" + "ldr d6, [x22, #0x0]\n" + "ldr x21, [x22, #0x8]\n" + "mov v5.d[1], x19\n" + "mov v6.d[1], x21\n" + "bge 138b\n" + "139:" // Height 7: Multiply loop: Single iteration only + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x16, #0x10]\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr q10, [x16, #0x20]\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr q11, [x16, #0x30]\n" + "fmla v27.4s, v8.4s, v3.s[0]\n" + "sub x12, x12, #0x4\n" + "fmla v28.4s, v8.4s, v4.s[0]\n" + "add x11, x11, #0x10\n" + "fmla v29.4s, v8.4s, v5.s[0]\n" + "prfm pldl1keep, [x11, #0x80]\n" + "fmla v30.4s, v8.4s, v6.s[0]\n" + "add x9, x9, #0x10\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "add x27, x27, #0x10\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "fmla v27.4s, v9.4s, v3.s[1]\n" + "add x25, x25, #0x10\n" + "fmla v28.4s, v9.4s, v4.s[1]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "fmla v29.4s, v9.4s, v5.s[1]\n" + "add x24, x24, #0x10\n" + "fmla v30.4s, v9.4s, v6.s[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "add x23, x23, #0x10\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "add x22, x22, #0x10\n" + "fmla v27.4s, v10.4s, v3.s[2]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v28.4s, v10.4s, v4.s[2]\n" + "add x16, x16, #0x40\n" + "fmla v29.4s, v10.4s, v5.s[2]\n" + "fmla v30.4s, v10.4s, v6.s[2]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "fmla v27.4s, v11.4s, v3.s[3]\n" + "fmla v28.4s, v11.4s, v4.s[3]\n" + "fmla v29.4s, v11.4s, v5.s[3]\n" + "fmla v30.4s, v11.4s, v6.s[3]\n" + "140:" // Height 7: Multiply loop: Main loop skip + "cbz x12, 142f\n" + "141:" // Height 7: Multiply loop: Odd block loop + "ldr s0, [x11], 
#0x4\n" + "sub x12, x12, #0x1\n" + "ldr s1, [x9], #0x4\n" + "ldr s2, [x27], #0x4\n" + "ldr s3, [x25], #0x4\n" + "ldr s4, [x24], #0x4\n" + "ldr s5, [x23], #0x4\n" + "ldr s6, [x22], #0x4\n" + "ldr q12, [x16, #0x0]\n" + "add x16, x16, #0x10\n" + "fmla v24.4s, v12.4s, v0.s[0]\n" + "fmla v25.4s, v12.4s, v1.s[0]\n" + "fmla v26.4s, v12.4s, v2.s[0]\n" + "fmla v27.4s, v12.4s, v3.s[0]\n" + "fmla v28.4s, v12.4s, v4.s[0]\n" + "fmla v29.4s, v12.4s, v5.s[0]\n" + "fmla v30.4s, v12.4s, v6.s[0]\n" + "cbnz x12, 141b\n" + "142:" // Height 7: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x13, x13, #0x1\n" + "cmp x13, x19\n" + "bne 135b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x14, #0x0]\n" + "add x27, x14, x19, LSL #2\n" + "prfm pstl1keep, [x27, #0x0]\n" + "add x26, x27, x19, LSL #2\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x25, x26, x19, LSL #2\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add x24, x25, x19, LSL #2\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "tbz %x[flags], #1, 143f\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v17.4s }, [x20]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmin v27.4s, v27.4s, v16.4s\n" + "fmin v28.4s, v28.4s, v16.4s\n" + "fmin v29.4s, v29.4s, v16.4s\n" + "fmin v30.4s, v30.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "fmax v27.4s, v27.4s, v17.4s\n" + "fmax v28.4s, v28.4s, v17.4s\n" + "fmax v29.4s, v29.4s, v17.4s\n" + "fmax v30.4s, v30.4s, v17.4s\n" + "143:" // Height 7: No activation + "cmp x17, #0x4\n" + "bge 146f\n" + "tbz x17, #1, 144f\n" + "str d24, [x14], #0x8\n" + "str d25, [x27], #0x8\n" + "str d26, [x26], #0x8\n" + "str 
d27, [x25], #0x8\n" + "str d28, [x24], #0x8\n" + "str d29, [x23], #0x8\n" + "str d30, [x22], #0x8\n" + "tbz x17, #0, 145f\n" + "st1 { v24.s }[2], [x14]\n" + "st1 { v25.s }[2], [x27]\n" + "st1 { v26.s }[2], [x26]\n" + "st1 { v27.s }[2], [x25]\n" + "st1 { v28.s }[2], [x24]\n" + "st1 { v29.s }[2], [x23]\n" + "st1 { v30.s }[2], [x22]\n" + "b 145f\n" + "144:" // Height 7: Partial direct writeback: partial_1_0 + "str s24, [x14, #0x0]\n" + "str s25, [x27, #0x0]\n" + "str s26, [x26, #0x0]\n" + "str s27, [x25, #0x0]\n" + "str s28, [x24, #0x0]\n" + "str s29, [x23, #0x0]\n" + "str s30, [x22, #0x0]\n" + "145:" // Height 7: Partial direct writeback: Done + "b 147f\n" + "146:" // Height 7: Full writeback + "str q24, [x14, #0x0]\n" + "add x14, x14, #0x10\n" + "str q25, [x27, #0x0]\n" + "str q26, [x26, #0x0]\n" + "str q27, [x25, #0x0]\n" + "str q28, [x24, #0x0]\n" + "str q29, [x23, #0x0]\n" + "str q30, [x22, #0x0]\n" + "147:" // Height 7: Writeback done + "subs x17, x17, #0x4\n" + "bgt 128b\n" + "b 170f\n" + "148:" // Height 8 + "ldr x17, [%x[args_ptr], %[offsetof_N]]\n" + "mov x15, %x[bias]\n" + "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x14, %x[output_ptr]\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x19, #0x20\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "149:" // Height 8: Column loop + "cbz x15, 150f\n" + "ldr q24, [x15, #0x0]\n" + "add x15, x15, #0x10\n" + "mov v25.16b, v24.16b\n" + "mov v26.16b, v24.16b\n" + "mov v27.16b, v24.16b\n" + "mov v28.16b, v24.16b\n" + "mov v29.16b, v24.16b\n" + "mov v30.16b, v24.16b\n" + "mov v31.16b, v24.16b\n" + "b 155f\n" + "150:" // Height 8: no bias + "tbz %x[flags], #0, 154f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x17, #0x4\n" + "add x27, x14, x19, LSL #2\n" + "add x26, x27, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "bge 153f\n" + "tbz 
x17, #1, 151f\n" + "ldr d24, [x14], #0x8\n" + "ldr d25, [x27], #0x8\n" + "mov x19, #0x8\n" + "ldr d26, [x26], #0x8\n" + "ldr d27, [x25], #0x8\n" + "ldr d28, [x24], #0x8\n" + "ldr d29, [x23], #0x8\n" + "ldr d30, [x22], #0x8\n" + "ldr d31, [x21], #0x8\n" + "tbz x17, #0, 152f\n" + "ld1 { v24.s }[2], [x14]\n" + "ld1 { v25.s }[2], [x27]\n" + "ld1 { v26.s }[2], [x26]\n" + "ld1 { v27.s }[2], [x25]\n" + "ld1 { v28.s }[2], [x24]\n" + "ld1 { v29.s }[2], [x23]\n" + "ld1 { v30.s }[2], [x22]\n" + "ld1 { v31.s }[2], [x21]\n" + "b 152f\n" + "151:" // Height 8: Partial accumulate: partial_1_0 + "ldr s24, [x14, #0x0]\n" + "mov x19, #0x0\n" + "ldr s25, [x27, #0x0]\n" + "ldr s26, [x26, #0x0]\n" + "ldr s27, [x25, #0x0]\n" + "ldr s28, [x24, #0x0]\n" + "ldr s29, [x23, #0x0]\n" + "ldr s30, [x22, #0x0]\n" + "ldr s31, [x21, #0x0]\n" + "152:" // Height 8: Partial accumulate: Done + "sub x14, x14, x19\n" + "b 155f\n" + "153:" // Height 8: full accumulate + "ldr q24, [x14, #0x0]\n" + "ldr q25, [x27, #0x0]\n" + "ldr q26, [x26, #0x0]\n" + "ldr q27, [x25, #0x0]\n" + "ldr q28, [x24, #0x0]\n" + "ldr q29, [x23, #0x0]\n" + "ldr q30, [x22, #0x0]\n" + "ldr q31, [x21, #0x0]\n" + "b 155f\n" + "154:" // Height 8: no accumulate + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "155:" // Height 8: setup done + "mov x13, #0x0\n" + "156:" // Height 8: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w12, [x20, x13, LSL #0x2]\n" + "tbz %x[flags], #3, 157f\n" + "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x11, [x20, #0x0]\n" + "ldr x9, [x20, #0x8]\n" + "ldr x27, [x20, #0x10]\n" + "ldr x25, [x20, #0x18]\n" + "ldr x24, [x20, #0x20]\n" + "ldr x23, [x20, #0x28]\n" + "ldr x22, [x20, #0x30]\n" + "ldr x20, [x20, #0x38]\n" + "cbnz x13, 158f\n" + 
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x11, x11, x19, LSL #2\n" + "add x9, x9, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "add x20, x20, x19, LSL #2\n" + "b 158f\n" + "157:" // Height 8: setup direct input + "mov x11, %x[input_ptr]\n" + "add x9, x11, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x20, x22, x19, LSL #2\n" + "158:" // Height 8: input setup done + "cmp x12, #0x4\n" + "blt 161f\n" + "ldr q0, [x11, #0x0]\n" + "ldr q1, [x9, #0x0]\n" + "cmp x12, #0x8\n" + "ldr q2, [x27, #0x0]\n" + "ldr q3, [x25, #0x0]\n" + "ldr q4, [x24, #0x0]\n" + "ldr q5, [x23, #0x0]\n" + "ldr q6, [x22, #0x0]\n" + "ldr q7, [x20, #0x0]\n" + "ldr q8, [x16, #0x0]\n" + "blt 160f\n" + "159:" // Height 8: Multiply loop: Main loop head + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr d9, [x16, #0x10]\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr x19, [x16, #0x18]\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr d10, [x16, #0x20]\n" + "fmla v27.4s, v8.4s, v3.s[0]\n" + "ldr x21, [x16, #0x28]\n" + "fmla v28.4s, v8.4s, v4.s[0]\n" + "mov v9.d[1], x19\n" + "fmla v29.4s, v8.4s, v5.s[0]\n" + "ldr d11, [x16, #0x30]\n" + "fmla v30.4s, v8.4s, v6.s[0]\n" + "mov v10.d[1], x21\n" + "fmla v31.4s, v8.4s, v7.s[0]\n" + "ldr x19, [x16, #0x38]\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "add x11, x11, #0x10\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "prfm pldl1keep, [x11, #0x80]\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "mov v11.d[1], x19\n" + "fmla v27.4s, v9.4s, v3.s[1]\n" + "ldr x10, [x11, #0x8]\n" + "fmla v28.4s, v9.4s, v4.s[1]\n" + "add x9, x9, #0x10\n" + "fmla v29.4s, v9.4s, v5.s[1]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "fmla v30.4s, v9.4s, v6.s[1]\n" + "ldr x28, [x9, #0x8]\n" + "fmla v31.4s, v9.4s, v7.s[1]\n" + "add x27, x27, #0x10\n" + "fmla 
v24.4s, v10.4s, v0.s[2]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "ldr x26, [x27, #0x8]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "add x25, x25, #0x10\n" + "fmla v27.4s, v10.4s, v3.s[2]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "fmla v28.4s, v10.4s, v4.s[2]\n" + "ldr x19, [x25, #0x8]\n" + "fmla v29.4s, v10.4s, v5.s[2]\n" + "add x24, x24, #0x10\n" + "fmla v30.4s, v10.4s, v6.s[2]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v31.4s, v10.4s, v7.s[2]\n" + "ldr x21, [x24, #0x8]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "ldr d0, [x11, #0x0]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "ldr d1, [x9, #0x0]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "ldr d2, [x27, #0x0]\n" + "fmla v27.4s, v11.4s, v3.s[3]\n" + "mov v0.d[1], x10\n" + "fmla v28.4s, v11.4s, v4.s[3]\n" + "mov v1.d[1], x28\n" + "fmla v29.4s, v11.4s, v5.s[3]\n" + "mov v2.d[1], x26\n" + "fmla v30.4s, v11.4s, v6.s[3]\n" + "ldr d3, [x25, #0x0]\n" + "fmla v31.4s, v11.4s, v7.s[3]\n" + "ldr d4, [x24, #0x0]\n" + "add x23, x23, #0x10\n" + "add x22, x22, #0x10\n" + "mov v3.d[1], x19\n" + "prfm pldl1keep, [x23, #0x80]\n" + "mov v4.d[1], x21\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr d5, [x23, #0x0]\n" + "add x20, x20, #0x10\n" + "prfm pldl1keep, [x20, #0x80]\n" + "sub x12, x12, #0x4\n" + "ldr x19, [x23, #0x8]\n" + "cmp x12, #0x8\n" + "ldr d6, [x22, #0x0]\n" + "add x16, x16, #0x40\n" + "ldr d8, [x16, #0x0]\n" + "mov v5.d[1], x19\n" + "ldr x26, [x16, #0x8]\n" + "ldr x21, [x22, #0x8]\n" + "ldr d7, [x20, #0x0]\n" + "mov v8.d[1], x26\n" + "ldr x19, [x20, #0x8]\n" + "mov v6.d[1], x21\n" + "mov v7.d[1], x19\n" + "bge 159b\n" + "160:" // Height 8: Multiply loop: Single iteration only + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x16, #0x10]\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr q10, [x16, #0x20]\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr q11, [x16, #0x30]\n" + "fmla v27.4s, v8.4s, v3.s[0]\n" + "sub x12, x12, #0x4\n" + "fmla v28.4s, v8.4s, v4.s[0]\n" + "add x11, x11, #0x10\n" + "fmla v29.4s, v8.4s, 
v5.s[0]\n" + "prfm pldl1keep, [x11, #0x80]\n" + "fmla v30.4s, v8.4s, v6.s[0]\n" + "add x9, x9, #0x10\n" + "fmla v31.4s, v8.4s, v7.s[0]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "add x27, x27, #0x10\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "add x25, x25, #0x10\n" + "fmla v27.4s, v9.4s, v3.s[1]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "fmla v28.4s, v9.4s, v4.s[1]\n" + "add x24, x24, #0x10\n" + "fmla v29.4s, v9.4s, v5.s[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v30.4s, v9.4s, v6.s[1]\n" + "add x23, x23, #0x10\n" + "fmla v31.4s, v9.4s, v7.s[1]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "add x22, x22, #0x10\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "add x20, x20, #0x10\n" + "fmla v27.4s, v10.4s, v3.s[2]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "fmla v28.4s, v10.4s, v4.s[2]\n" + "add x16, x16, #0x40\n" + "fmla v29.4s, v10.4s, v5.s[2]\n" + "fmla v30.4s, v10.4s, v6.s[2]\n" + "fmla v31.4s, v10.4s, v7.s[2]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "fmla v27.4s, v11.4s, v3.s[3]\n" + "fmla v28.4s, v11.4s, v4.s[3]\n" + "fmla v29.4s, v11.4s, v5.s[3]\n" + "fmla v30.4s, v11.4s, v6.s[3]\n" + "fmla v31.4s, v11.4s, v7.s[3]\n" + "161:" // Height 8: Multiply loop: Main loop skip + "cbz x12, 163f\n" + "162:" // Height 8: Multiply loop: Odd block loop + "ldr s0, [x11], #0x4\n" + "sub x12, x12, #0x1\n" + "ldr s1, [x9], #0x4\n" + "ldr s2, [x27], #0x4\n" + "ldr s3, [x25], #0x4\n" + "ldr s4, [x24], #0x4\n" + "ldr s5, [x23], #0x4\n" + "ldr s6, [x22], #0x4\n" + "ldr s7, [x20], #0x4\n" + "ldr q12, [x16, #0x0]\n" + "add x16, x16, #0x10\n" + "fmla v24.4s, v12.4s, v0.s[0]\n" + "fmla v25.4s, v12.4s, v1.s[0]\n" + "fmla v26.4s, v12.4s, v2.s[0]\n" + "fmla v27.4s, v12.4s, v3.s[0]\n" + "fmla v28.4s, v12.4s, v4.s[0]\n" + "fmla v29.4s, 
v12.4s, v5.s[0]\n" + "fmla v30.4s, v12.4s, v6.s[0]\n" + "fmla v31.4s, v12.4s, v7.s[0]\n" + "cbnz x12, 162b\n" + "163:" // Height 8: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x13, x13, #0x1\n" + "cmp x13, x19\n" + "bne 156b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x14, #0x0]\n" + "add x27, x14, x19, LSL #2\n" + "prfm pstl1keep, [x27, #0x0]\n" + "add x26, x27, x19, LSL #2\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x25, x26, x19, LSL #2\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add x24, x25, x19, LSL #2\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbz %x[flags], #1, 164f\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v17.4s }, [x20]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmin v27.4s, v27.4s, v16.4s\n" + "fmin v28.4s, v28.4s, v16.4s\n" + "fmin v29.4s, v29.4s, v16.4s\n" + "fmin v30.4s, v30.4s, v16.4s\n" + "fmin v31.4s, v31.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "fmax v27.4s, v27.4s, v17.4s\n" + "fmax v28.4s, v28.4s, v17.4s\n" + "fmax v29.4s, v29.4s, v17.4s\n" + "fmax v30.4s, v30.4s, v17.4s\n" + "fmax v31.4s, v31.4s, v17.4s\n" + "164:" // Height 8: No activation + "cmp x17, #0x4\n" + "bge 167f\n" + "tbz x17, #1, 165f\n" + "str d24, [x14], #0x8\n" + "str d25, [x27], #0x8\n" + "str d26, [x26], #0x8\n" + "str d27, [x25], #0x8\n" + "str d28, [x24], #0x8\n" + "str d29, [x23], #0x8\n" + "str d30, [x22], #0x8\n" + "str d31, [x21], #0x8\n" + "tbz x17, #0, 166f\n" + "st1 { v24.s }[2], [x14]\n" + "st1 { v25.s }[2], [x27]\n" + "st1 { v26.s }[2], [x26]\n" + "st1 { v27.s }[2], 
[x25]\n" + "st1 { v28.s }[2], [x24]\n" + "st1 { v29.s }[2], [x23]\n" + "st1 { v30.s }[2], [x22]\n" + "st1 { v31.s }[2], [x21]\n" + "b 166f\n" + "165:" // Height 8: Partial direct writeback: partial_1_0 + "str s24, [x14, #0x0]\n" + "str s25, [x27, #0x0]\n" + "str s26, [x26, #0x0]\n" + "str s27, [x25, #0x0]\n" + "str s28, [x24, #0x0]\n" + "str s29, [x23, #0x0]\n" + "str s30, [x22, #0x0]\n" + "str s31, [x21, #0x0]\n" + "166:" // Height 8: Partial direct writeback: Done + "b 168f\n" + "167:" // Height 8: Full writeback + "str q24, [x14, #0x0]\n" + "add x14, x14, #0x10\n" + "str q25, [x27, #0x0]\n" + "str q26, [x26, #0x0]\n" + "str q27, [x25, #0x0]\n" + "str q28, [x24, #0x0]\n" + "str q29, [x23, #0x0]\n" + "str q30, [x22, #0x0]\n" + "str q31, [x21, #0x0]\n" + "168:" // Height 8: Writeback done + "subs x17, x17, #0x4\n" + "bgt 149b\n" + "subs %x[M], %x[M], #0x8\n" + "beq 170f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 169f\n" + "add x20, x20, #0x8\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "169:" // Update direct input + "mov x19, #0x20\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "170:" // Exit + + : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", 
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp index 7f3fc898f5..9bed0213da 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp @@ -95,278 +95,266 @@ void a64_hybrid_fp32_mla_8x4 ( "1:" // Row loop "cmp %x[M], #0x8\n" - "bge 155f\n" + "bge 148f\n" "cmp %x[M], #0x6\n" - "bgt 133f\n" - "beq 111f\n" + "bgt 127f\n" + "beq 106f\n" "cmp %x[M], #0x4\n" - "bgt 89f\n" - "beq 67f\n" + "bgt 85f\n" + "beq 64f\n" "cmp %x[M], #0x2\n" - "bgt 45f\n" - "beq 23f\n" - "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "mov x8, %x[bias]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 2f\n" - "ldr x17, [%x[output_ptr], #0x0]\n" - "add x17, x17, x19, LSL #2\n" - "b 3f\n" - "2:" // Height 1: setup direct output - "mov x17, %x[output_ptr]\n" - "3:" // Height 1: Column loop - "cbz x8, 4f\n" - "ldr q24, [x8, #0x0]\n" - "add x8, x8, #0x10\n" - "b 9f\n" - "4:" // Height 1: no bias - "tbz %x[flags], #0, 8f\n" - "cmp x6, #0x4\n" - "bge 7f\n" - "tbz x6, #1, 5f\n" - "ldr d24, [x17], #0x8\n" + "bgt 43f\n" + "beq 22f\n" + "ldr x13, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x11, %x[bias]\n" + "mov x10, %x[output_ptr]\n" + "2:" // Height 1: Column loop + "cbz x11, 3f\n" + "ldr q24, [x11, #0x0]\n" + "add x11, x11, #0x10\n" + "b 8f\n" + "3:" // Height 1: no bias + "tbz %x[flags], #0, 7f\n" + "cmp x13, #0x4\n" + "bge 6f\n" + "tbz x13, #1, 4f\n" + "ldr d24, [x10], #0x8\n" "mov x19, #0x8\n" - "tbz x6, #0, 6f\n" - 
"ld1 { v24.s }[2], [x17]\n" - "b 6f\n" - "5:" // Height 1: Partial accumulate: partial_1_0 + "tbz x13, #0, 5f\n" + "ld1 { v24.s }[2], [x10]\n" + "b 5f\n" + "4:" // Height 1: Partial accumulate: partial_1_0 + "ldr s24, [x10, #0x0]\n" "mov x19, #0x0\n" - "ldr s24, [x17, #0x0]\n" - "6:" // Height 1: Partial accumulate: Done - "sub x17, x17, x19\n" - "b 9f\n" - "7:" // Height 1: full accumulate - "ldr q24, [x17, #0x0]\n" - "b 9f\n" - "8:" // Height 1: no accumulate + "5:" // Height 1: Partial accumulate: Done + "sub x10, x10, x19\n" + "b 8f\n" + "6:" // Height 1: full accumulate + "ldr q24, [x10, #0x0]\n" + "b 8f\n" + "7:" // Height 1: no accumulate "movi v24.16b, #0x0\n" - "9:" // Height 1: setup done - "mov x16, #0x0\n" - "10:" // Height 1: String loop + "8:" // Height 1: setup done + "mov x9, #0x0\n" + "9:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w15, [x20, x16, LSL #0x2]\n" - "tbz %x[flags], #3, 11f\n" - "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "ldr w28, [x20, x9, LSL #0x2]\n" + "tbz %x[flags], #3, 10f\n" + "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x14, [x20, #0x0]\n" - "cbnz x16, 12f\n" + "ldr x27, [x20, #0x0]\n" + "cbnz x9, 11f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x14, x14, x19, LSL #2\n" - "b 12f\n" - "11:" // Height 1: setup direct input - "mov x14, %x[input_ptr]\n" - "12:" // Height 1: input setup done - "cmp x15, #0x4\n" - "blt 15f\n" - "cmp x15, #0x8\n" + "add x27, x27, x19, LSL #2\n" + "b 11f\n" + "10:" // Height 1: setup direct input + "mov x27, %x[input_ptr]\n" + "11:" // Height 1: input setup done + "cmp x28, #0x4\n" "blt 14f\n" - "13:" // Height 1: Multiply loop: Main loop head - "ldr q0, [x14, #0x0]\n" - "ldr q8, [x7, #0x0]\n" + "ldr q0, [x27, #0x0]\n" + "ldr q8, [x12, #0x0]\n" + "cmp x28, #0x8\n" + "blt 13f\n" + "12:" // Height 1: Multiply loop: Main loop head + "fmla v24.4s, 
v8.4s, v0.s[0]\n" + "ldr q9, [x12, #0x10]\n" + "add x27, x27, #0x10\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "ldr q10, [x12, #0x20]\n" + "sub x28, x28, #0x4\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "ldr q11, [x12, #0x30]\n" + "cmp x28, #0x8\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "add x12, x12, #0x40\n" + "ldr q0, [x27, #0x0]\n" + "ldr q8, [x12, #0x0]\n" + "bge 12b\n" + "13:" // Height 1: Multiply loop: Single iteration only "fmla v24.4s, v8.4s, v0.s[0]\n" - "ldr q9, [x7, #0x10]\n" - "ldr q10, [x7, #0x20]\n" + "ldr q9, [x12, #0x10]\n" + "sub x28, x28, #0x4\n" "fmla v24.4s, v9.4s, v0.s[1]\n" - "ldr q11, [x7, #0x30]\n" - "add x14, x14, #0x10\n" + "ldr q10, [x12, #0x20]\n" + "add x27, x27, #0x10\n" "fmla v24.4s, v10.4s, v0.s[2]\n" - "prfm pldl1keep, [x14, #0x80]\n" - "sub x15, x15, #0x4\n" + "ldr q11, [x12, #0x30]\n" + "add x12, x12, #0x40\n" "fmla v24.4s, v11.4s, v0.s[3]\n" - "cmp x15, #0x8\n" - "add x7, x7, #0x40\n" - "bge 13b\n" - "14:" // Height 1: Multiply loop: Single iteration only - "sub x15, x15, #0x4\n" - "ldr q0, [x14, #0x0]\n" - "ldr q12, [x7, #0x0]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "14:" // Height 1: Multiply loop: Main loop skip + "cbz x28, 16f\n" + "15:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x27], #0x4\n" + "sub x28, x28, #0x1\n" + "ldr q12, [x12, #0x0]\n" "fmla v24.4s, v12.4s, v0.s[0]\n" - "ldr q13, [x7, #0x10]\n" - "ldr q14, [x7, #0x20]\n" - "fmla v24.4s, v13.4s, v0.s[1]\n" - "ldr q15, [x7, #0x30]\n" - "add x14, x14, #0x10\n" - "fmla v24.4s, v14.4s, v0.s[2]\n" - "prfm pldl1keep, [x14, #0x80]\n" - "add x7, x7, #0x40\n" - "fmla v24.4s, v15.4s, v0.s[3]\n" - "15:" // Height 1: Multiply loop: Main loop skip - "cbz x15, 17f\n" - "16:" // Height 1: Multiply loop: Odd block loop - "ldr s0, [x14], #0x4\n" - "ldr q16, [x7, #0x0]\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "sub x15, x15, #0x1\n" - "add x7, x7, #0x10\n" - "cbnz x15, 16b\n" - "17:" // Height 1: Multiply loop: No odd multiplies + "add x12, x12, 
#0x10\n" + "cbnz x28, 15b\n" + "16:" // Height 1: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x16, x16, #0x1\n" - "cmp x16, x19\n" - "bne 10b\n" - "prfm pstl1keep, [x17, #0x0]\n" - "tbz %x[flags], #1, 18f\n" + "add x9, x9, #0x1\n" + "cmp x9, x19\n" + "bne 9b\n" + "prfm pstl1keep, [x10, #0x0]\n" + "tbz %x[flags], #1, 17f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1r { v17.4s }, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" "ld1r { v16.4s }, [x19]\n" "fmin v24.4s, v24.4s, v16.4s\n" "fmax v24.4s, v24.4s, v17.4s\n" - "18:" // Height 1: No activation - "cmp x6, #0x4\n" - "bge 21f\n" - "tbz x6, #1, 19f\n" - "str d24, [x17], #0x8\n" - "tbz x6, #0, 20f\n" - "st1 { v24.s }[2], [x17]\n" - "b 20f\n" - "19:" // Height 1: Partial direct writeback: partial_1_0 - "str s24, [x17, #0x0]\n" - "20:" // Height 1: Partial direct writeback: Done - "b 22f\n" - "21:" // Height 1: Full writeback - "str q24, [x17, #0x0]\n" - "add x17, x17, #0x10\n" - "22:" // Height 1: Writeback done - "subs x6, x6, #0x4\n" - "bgt 3b\n" - "b 178f\n" - "23:" // Height 2 - "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" - "mov x8, %x[bias]\n" - "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 24f\n" - "ldr x17, [%x[output_ptr], #0x0]\n" - "add x17, x17, x19, LSL #2\n" - "ldr x13, [%x[output_ptr], #0x8]\n" - "add x13, x13, x19, LSL #2\n" - "b 25f\n" - "24:" // Height 2: setup direct output - "mov x17, %x[output_ptr]\n" - "add x13, x17, x19, LSL #2\n" - "25:" // Height 2: Column loop - "cbz x8, 26f\n" - "ldr q24, [x8, #0x0]\n" + "17:" // Height 1: No activation + "cmp x13, #0x4\n" + "bge 20f\n" + "tbz x13, #1, 18f\n" + "str d24, [x10], #0x8\n" + "tbz x13, #0, 19f\n" + "st1 { v24.s }[2], [x10]\n" + "b 19f\n" + "18:" // Height 1: Partial direct writeback: partial_1_0 + "str s24, [x10, #0x0]\n" + "19:" // Height 1: Partial direct writeback: Done + "b 21f\n" + "20:" // Height 1: Full 
writeback + "str q24, [x10, #0x0]\n" + "add x10, x10, #0x10\n" + "21:" // Height 1: Writeback done + "subs x13, x13, #0x4\n" + "bgt 2b\n" + "b 170f\n" + "22:" // Height 2 + "ldr x13, [%x[args_ptr], %[offsetof_N]]\n" + "mov x11, %x[bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x10, %x[output_ptr]\n" + "23:" // Height 2: Column loop + "cbz x11, 24f\n" + "ldr q24, [x11, #0x0]\n" "mov v25.16b, v24.16b\n" - "add x8, x8, #0x10\n" - "b 31f\n" - "26:" // Height 2: no bias - "tbz %x[flags], #0, 30f\n" - "cmp x6, #0x4\n" - "bge 29f\n" - "tbz x6, #1, 27f\n" - "ldr d24, [x17], #0x8\n" - "ldr d25, [x13], #0x8\n" + "add x11, x11, #0x10\n" + "b 29f\n" + "24:" // Height 2: no bias + "tbz %x[flags], #0, 28f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x13, #0x4\n" + "add x26, x10, x19, LSL #2\n" + "bge 27f\n" + "tbz x13, #1, 25f\n" + "ldr d24, [x10], #0x8\n" + "ldr d25, [x26], #0x8\n" "mov x19, #0x8\n" - "tbz x6, #0, 28f\n" - "ld1 { v24.s }[2], [x17]\n" - "ld1 { v25.s }[2], [x13]\n" - "b 28f\n" - "27:" // Height 2: Partial accumulate: partial_1_0 + "tbz x13, #0, 26f\n" + "ld1 { v24.s }[2], [x10]\n" + "ld1 { v25.s }[2], [x26]\n" + "b 26f\n" + "25:" // Height 2: Partial accumulate: partial_1_0 + "ldr s24, [x10, #0x0]\n" "mov x19, #0x0\n" - "ldr s24, [x17, #0x0]\n" - "ldr s25, [x13, #0x0]\n" - "28:" // Height 2: Partial accumulate: Done - "sub x17, x17, x19\n" - "sub x13, x13, x19\n" - "b 31f\n" - "29:" // Height 2: full accumulate - "ldr q24, [x17, #0x0]\n" - "ldr q25, [x13, #0x0]\n" - "b 31f\n" - "30:" // Height 2: no accumulate + "ldr s25, [x26, #0x0]\n" + "26:" // Height 2: Partial accumulate: Done + "sub x10, x10, x19\n" + "b 29f\n" + "27:" // Height 2: full accumulate + "ldr q24, [x10, #0x0]\n" + "ldr q25, [x26, #0x0]\n" + "b 29f\n" + "28:" // Height 2: no accumulate "movi v24.16b, #0x0\n" "movi v25.16b, #0x0\n" - "31:" // Height 2: setup done - "mov x16, #0x0\n" - "32:" // Height 2: String loop + "29:" // Height 2: setup done + "mov x9, 
#0x0\n" + "30:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w15, [x20, x16, LSL #0x2]\n" - "tbz %x[flags], #3, 33f\n" - "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "ldr w28, [x20, x9, LSL #0x2]\n" + "tbz %x[flags], #3, 31f\n" + "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x14, [x20, #0x0]\n" - "ldr x12, [x20, #0x8]\n" - "cbnz x16, 34f\n" + "ldr x27, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "cbnz x9, 32f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x14, x14, x19, LSL #2\n" - "add x12, x12, x19, LSL #2\n" - "b 34f\n" - "33:" // Height 2: setup direct input - "mov x14, %x[input_ptr]\n" - "add x12, x14, x19, LSL #2\n" - "34:" // Height 2: input setup done - "cmp x15, #0x4\n" - "blt 37f\n" - "cmp x15, #0x8\n" - "blt 36f\n" - "35:" // Height 2: Multiply loop: Main loop head - "ldr q0, [x14, #0x0]\n" - "ldr q1, [x12, #0x0]\n" - "ldr q8, [x7, #0x0]\n" + "add x27, x27, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "b 32f\n" + "31:" // Height 2: setup direct input + "mov x27, %x[input_ptr]\n" + "add x26, x27, x19, LSL #2\n" + "32:" // Height 2: input setup done + "cmp x28, #0x4\n" + "blt 35f\n" + "ldr q0, [x27, #0x0]\n" + "ldr q1, [x26, #0x0]\n" + "cmp x28, #0x8\n" + "ldr q8, [x12, #0x0]\n" + "blt 34f\n" + "33:" // Height 2: Multiply loop: Main loop head "fmla v24.4s, v8.4s, v0.s[0]\n" - "ldr q9, [x7, #0x10]\n" + "ldr q9, [x12, #0x10]\n" + "add x27, x27, #0x10\n" "fmla v25.4s, v8.4s, v1.s[0]\n" - "ldr q10, [x7, #0x20]\n" - "ldr q11, [x7, #0x30]\n" + "ldr q10, [x12, #0x20]\n" + "add x26, x26, #0x10\n" "fmla v24.4s, v9.4s, v0.s[1]\n" - "add x14, x14, #0x10\n" - "prfm pldl1keep, [x14, #0x80]\n" + "ldr q11, [x12, #0x30]\n" + "sub x28, x28, #0x4\n" "fmla v25.4s, v9.4s, v1.s[1]\n" - "add x12, x12, #0x10\n" + "prfm pldl1keep, [x27, #0x80]\n" + "cmp x28, #0x8\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "prfm pldl1keep, [x26, 
#0x80]\n" + "add x12, x12, #0x40\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "ldr q8, [x12, #0x0]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "ldr q0, [x27, #0x0]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "ldr q1, [x26, #0x0]\n" + "bge 33b\n" + "34:" // Height 2: Multiply loop: Single iteration only + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x12, #0x10]\n" + "sub x28, x28, #0x4\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr q10, [x12, #0x20]\n" + "add x27, x27, #0x10\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "ldr q11, [x12, #0x30]\n" + "add x26, x26, #0x10\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "add x12, x12, #0x40\n" "fmla v24.4s, v10.4s, v0.s[2]\n" - "prfm pldl1keep, [x12, #0x80]\n" - "sub x15, x15, #0x4\n" + "prfm pldl1keep, [x26, #0x80]\n" "fmla v25.4s, v10.4s, v1.s[2]\n" - "cmp x15, #0x8\n" - "add x7, x7, #0x40\n" "fmla v24.4s, v11.4s, v0.s[3]\n" "fmla v25.4s, v11.4s, v1.s[3]\n" - "bge 35b\n" - "36:" // Height 2: Multiply loop: Single iteration only - "sub x15, x15, #0x4\n" - "ldr q0, [x14, #0x0]\n" - "ldr q1, [x12, #0x0]\n" - "ldr q12, [x7, #0x0]\n" + "35:" // Height 2: Multiply loop: Main loop skip + "cbz x28, 37f\n" + "36:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x27], #0x4\n" + "sub x28, x28, #0x1\n" + "ldr s1, [x26], #0x4\n" + "ldr q12, [x12, #0x0]\n" "fmla v24.4s, v12.4s, v0.s[0]\n" - "ldr q13, [x7, #0x10]\n" - "fmla v25.4s, v12.4s, v1.s[0]\n" - "ldr q14, [x7, #0x20]\n" - "ldr q15, [x7, #0x30]\n" - "fmla v24.4s, v13.4s, v0.s[1]\n" - "add x14, x14, #0x10\n" - "prfm pldl1keep, [x14, #0x80]\n" - "fmla v25.4s, v13.4s, v1.s[1]\n" "add x12, x12, #0x10\n" - "fmla v24.4s, v14.4s, v0.s[2]\n" - "prfm pldl1keep, [x12, #0x80]\n" - "add x7, x7, #0x40\n" - "fmla v25.4s, v14.4s, v1.s[2]\n" - "fmla v24.4s, v15.4s, v0.s[3]\n" - "fmla v25.4s, v15.4s, v1.s[3]\n" - "37:" // Height 2: Multiply loop: Main loop skip - "cbz x15, 39f\n" - "38:" // Height 2: Multiply loop: Odd block loop - "ldr s0, [x14], #0x4\n" - "ldr s1, [x12], #0x4\n" - 
"ldr q16, [x7, #0x0]\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "sub x15, x15, #0x1\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "add x7, x7, #0x10\n" - "cbnz x15, 38b\n" - "39:" // Height 2: Multiply loop: No odd multiplies + "fmla v25.4s, v12.4s, v1.s[0]\n" + "cbnz x28, 36b\n" + "37:" // Height 2: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x16, x16, #0x1\n" - "cmp x16, x19\n" - "bne 32b\n" - "prfm pstl1keep, [x17, #0x0]\n" - "prfm pstl1keep, [x13, #0x0]\n" - "tbz %x[flags], #1, 40f\n" + "add x9, x9, #0x1\n" + "cmp x9, x19\n" + "bne 30b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x10, #0x0]\n" + "add x26, x10, x19, LSL #2\n" + "prfm pstl1keep, [x26, #0x0]\n" + "tbz %x[flags], #1, 38f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1r { v17.4s }, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -375,194 +363,185 @@ void a64_hybrid_fp32_mla_8x4 ( "fmin v25.4s, v25.4s, v16.4s\n" "fmax v24.4s, v24.4s, v17.4s\n" "fmax v25.4s, v25.4s, v17.4s\n" - "40:" // Height 2: No activation - "cmp x6, #0x4\n" - "bge 43f\n" - "tbz x6, #1, 41f\n" - "str d24, [x17], #0x8\n" - "str d25, [x13], #0x8\n" - "tbz x6, #0, 42f\n" - "st1 { v24.s }[2], [x17]\n" - "st1 { v25.s }[2], [x13]\n" + "38:" // Height 2: No activation + "cmp x13, #0x4\n" + "bge 41f\n" + "tbz x13, #1, 39f\n" + "str d24, [x10], #0x8\n" + "str d25, [x26], #0x8\n" + "tbz x13, #0, 40f\n" + "st1 { v24.s }[2], [x10]\n" + "st1 { v25.s }[2], [x26]\n" + "b 40f\n" + "39:" // Height 2: Partial direct writeback: partial_1_0 + "str s24, [x10, #0x0]\n" + "str s25, [x26, #0x0]\n" + "40:" // Height 2: Partial direct writeback: Done "b 42f\n" - "41:" // Height 2: Partial direct writeback: partial_1_0 - "str s24, [x17, #0x0]\n" - "str s25, [x13, #0x0]\n" - "42:" // Height 2: Partial direct writeback: Done - "b 44f\n" - "43:" // Height 2: Full writeback - "str q24, [x17, #0x0]\n" - "str q25, [x13, #0x0]\n" - "add x17, x17, #0x10\n" - "add x13, x13, #0x10\n" - 
"44:" // Height 2: Writeback done - "subs x6, x6, #0x4\n" - "bgt 25b\n" - "b 178f\n" - "45:" // Height 3 - "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" - "mov x8, %x[bias]\n" - "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 46f\n" - "ldr x17, [%x[output_ptr], #0x0]\n" - "add x17, x17, x19, LSL #2\n" - "ldr x13, [%x[output_ptr], #0x8]\n" - "ldr x11, [%x[output_ptr], #0x10]\n" - "add x13, x13, x19, LSL #2\n" - "add x11, x11, x19, LSL #2\n" - "b 47f\n" - "46:" // Height 3: setup direct output - "mov x17, %x[output_ptr]\n" - "add x13, x17, x19, LSL #2\n" - "add x11, x13, x19, LSL #2\n" - "47:" // Height 3: Column loop - "cbz x8, 48f\n" - "ldr q24, [x8, #0x0]\n" + "41:" // Height 2: Full writeback + "str q24, [x10, #0x0]\n" + "add x10, x10, #0x10\n" + "str q25, [x26, #0x0]\n" + "42:" // Height 2: Writeback done + "subs x13, x13, #0x4\n" + "bgt 23b\n" + "b 170f\n" + "43:" // Height 3 + "ldr x13, [%x[args_ptr], %[offsetof_N]]\n" + "mov x11, %x[bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x10, %x[output_ptr]\n" + "44:" // Height 3: Column loop + "cbz x11, 45f\n" + "ldr q24, [x11, #0x0]\n" "mov v25.16b, v24.16b\n" - "add x8, x8, #0x10\n" + "add x11, x11, #0x10\n" "mov v26.16b, v24.16b\n" - "b 53f\n" - "48:" // Height 3: no bias - "tbz %x[flags], #0, 52f\n" - "cmp x6, #0x4\n" - "bge 51f\n" - "tbz x6, #1, 49f\n" - "ldr d24, [x17], #0x8\n" - "ldr d25, [x13], #0x8\n" - "ldr d26, [x11], #0x8\n" - "mov x19, #0x8\n" - "tbz x6, #0, 50f\n" - "ld1 { v24.s }[2], [x17]\n" - "ld1 { v25.s }[2], [x13]\n" - "ld1 { v26.s }[2], [x11]\n" "b 50f\n" - "49:" // Height 3: Partial accumulate: partial_1_0 + "45:" // Height 3: no bias + "tbz %x[flags], #0, 49f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x13, #0x4\n" + "add x26, x10, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "bge 48f\n" + "tbz x13, #1, 46f\n" + "ldr d24, [x10], #0x8\n" + "ldr d25, [x26], #0x8\n" + "mov x19, 
#0x8\n" + "ldr d26, [x25], #0x8\n" + "tbz x13, #0, 47f\n" + "ld1 { v24.s }[2], [x10]\n" + "ld1 { v25.s }[2], [x26]\n" + "ld1 { v26.s }[2], [x25]\n" + "b 47f\n" + "46:" // Height 3: Partial accumulate: partial_1_0 + "ldr s24, [x10, #0x0]\n" "mov x19, #0x0\n" - "ldr s24, [x17, #0x0]\n" - "ldr s25, [x13, #0x0]\n" - "ldr s26, [x11, #0x0]\n" - "50:" // Height 3: Partial accumulate: Done - "sub x17, x17, x19\n" - "sub x13, x13, x19\n" - "sub x11, x11, x19\n" - "b 53f\n" - "51:" // Height 3: full accumulate - "ldr q24, [x17, #0x0]\n" - "ldr q25, [x13, #0x0]\n" - "ldr q26, [x11, #0x0]\n" - "b 53f\n" - "52:" // Height 3: no accumulate + "ldr s25, [x26, #0x0]\n" + "ldr s26, [x25, #0x0]\n" + "47:" // Height 3: Partial accumulate: Done + "sub x10, x10, x19\n" + "b 50f\n" + "48:" // Height 3: full accumulate + "ldr q24, [x10, #0x0]\n" + "ldr q25, [x26, #0x0]\n" + "ldr q26, [x25, #0x0]\n" + "b 50f\n" + "49:" // Height 3: no accumulate "movi v24.16b, #0x0\n" "movi v25.16b, #0x0\n" "movi v26.16b, #0x0\n" - "53:" // Height 3: setup done - "mov x16, #0x0\n" - "54:" // Height 3: String loop + "50:" // Height 3: setup done + "mov x9, #0x0\n" + "51:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w15, [x20, x16, LSL #0x2]\n" - "tbz %x[flags], #3, 55f\n" - "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "ldr w28, [x20, x9, LSL #0x2]\n" + "tbz %x[flags], #3, 52f\n" + "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x14, [x20, #0x0]\n" - "ldr x12, [x20, #0x8]\n" - "ldr x10, [x20, #0x10]\n" - "cbnz x16, 56f\n" + "ldr x27, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x25, [x20, #0x10]\n" + "cbnz x9, 53f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x14, x14, x19, LSL #2\n" - "add x12, x12, x19, LSL #2\n" - "add x10, x10, x19, LSL #2\n" - "b 56f\n" - "55:" // Height 3: setup direct input - "mov x14, %x[input_ptr]\n" - "add x12, x14, x19, LSL 
#2\n" - "add x10, x12, x19, LSL #2\n" - "56:" // Height 3: input setup done - "cmp x15, #0x4\n" - "blt 59f\n" - "cmp x15, #0x8\n" - "blt 58f\n" - "57:" // Height 3: Multiply loop: Main loop head - "ldr q0, [x14, #0x0]\n" - "ldr q1, [x12, #0x0]\n" - "ldr q2, [x10, #0x0]\n" - "ldr q8, [x7, #0x0]\n" + "add x27, x27, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "b 53f\n" + "52:" // Height 3: setup direct input + "mov x27, %x[input_ptr]\n" + "add x26, x27, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "53:" // Height 3: input setup done + "cmp x28, #0x4\n" + "blt 56f\n" + "ldr q0, [x27, #0x0]\n" + "ldr q1, [x26, #0x0]\n" + "cmp x28, #0x8\n" + "ldr q2, [x25, #0x0]\n" + "ldr q8, [x12, #0x0]\n" + "blt 55f\n" + "54:" // Height 3: Multiply loop: Main loop head "fmla v24.4s, v8.4s, v0.s[0]\n" - "ldr q9, [x7, #0x10]\n" + "ldr q9, [x12, #0x10]\n" + "add x27, x27, #0x10\n" "fmla v25.4s, v8.4s, v1.s[0]\n" - "ldr q10, [x7, #0x20]\n" + "ldr q10, [x12, #0x20]\n" + "add x26, x26, #0x10\n" "fmla v26.4s, v8.4s, v2.s[0]\n" - "ldr q11, [x7, #0x30]\n" - "add x14, x14, #0x10\n" + "ldr q11, [x12, #0x30]\n" + "add x25, x25, #0x10\n" "fmla v24.4s, v9.4s, v0.s[1]\n" - "prfm pldl1keep, [x14, #0x80]\n" - "add x12, x12, #0x10\n" + "prfm pldl1keep, [x27, #0x80]\n" + "sub x28, x28, #0x4\n" "fmla v25.4s, v9.4s, v1.s[1]\n" - "prfm pldl1keep, [x12, #0x80]\n" - "add x10, x10, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "cmp x28, #0x8\n" "fmla v26.4s, v9.4s, v2.s[1]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "sub x15, x15, #0x4\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x12, x12, #0x40\n" "fmla v24.4s, v10.4s, v0.s[2]\n" - "cmp x15, #0x8\n" - "add x7, x7, #0x40\n" + "ldr q8, [x12, #0x0]\n" "fmla v25.4s, v10.4s, v1.s[2]\n" "fmla v26.4s, v10.4s, v2.s[2]\n" "fmla v24.4s, v11.4s, v0.s[3]\n" + "ldr q0, [x27, #0x0]\n" "fmla v25.4s, v11.4s, v1.s[3]\n" + "ldr q1, [x26, #0x0]\n" "fmla v26.4s, v11.4s, v2.s[3]\n" - "bge 57b\n" - "58:" // Height 3: Multiply loop: Single 
iteration only - "sub x15, x15, #0x4\n" - "ldr q0, [x14, #0x0]\n" - "ldr q1, [x12, #0x0]\n" - "ldr q2, [x10, #0x0]\n" - "ldr q12, [x7, #0x0]\n" + "ldr q2, [x25, #0x0]\n" + "bge 54b\n" + "55:" // Height 3: Multiply loop: Single iteration only + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x12, #0x10]\n" + "sub x28, x28, #0x4\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr q10, [x12, #0x20]\n" + "add x27, x27, #0x10\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr q11, [x12, #0x30]\n" + "add x26, x26, #0x10\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "add x25, x25, #0x10\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x12, x12, #0x40\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "56:" // Height 3: Multiply loop: Main loop skip + "cbz x28, 58f\n" + "57:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x27], #0x4\n" + "sub x28, x28, #0x1\n" + "ldr s1, [x26], #0x4\n" + "ldr s2, [x25], #0x4\n" + "ldr q12, [x12, #0x0]\n" "fmla v24.4s, v12.4s, v0.s[0]\n" - "ldr q13, [x7, #0x10]\n" + "add x12, x12, #0x10\n" "fmla v25.4s, v12.4s, v1.s[0]\n" - "ldr q14, [x7, #0x20]\n" "fmla v26.4s, v12.4s, v2.s[0]\n" - "ldr q15, [x7, #0x30]\n" - "add x14, x14, #0x10\n" - "fmla v24.4s, v13.4s, v0.s[1]\n" - "prfm pldl1keep, [x14, #0x80]\n" - "add x12, x12, #0x10\n" - "fmla v25.4s, v13.4s, v1.s[1]\n" - "prfm pldl1keep, [x12, #0x80]\n" - "add x10, x10, #0x10\n" - "fmla v26.4s, v13.4s, v2.s[1]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x7, x7, #0x40\n" - "fmla v24.4s, v14.4s, v0.s[2]\n" - "fmla v25.4s, v14.4s, v1.s[2]\n" - "fmla v26.4s, v14.4s, v2.s[2]\n" - "fmla v24.4s, v15.4s, v0.s[3]\n" - "fmla v25.4s, v15.4s, v1.s[3]\n" - "fmla v26.4s, v15.4s, v2.s[3]\n" - "59:" // Height 3: Multiply loop: Main loop 
skip - "cbz x15, 61f\n" - "60:" // Height 3: Multiply loop: Odd block loop - "ldr s0, [x14], #0x4\n" - "ldr s1, [x12], #0x4\n" - "ldr s2, [x10], #0x4\n" - "ldr q16, [x7, #0x0]\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "sub x15, x15, #0x1\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "add x7, x7, #0x10\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "cbnz x15, 60b\n" - "61:" // Height 3: Multiply loop: No odd multiplies + "cbnz x28, 57b\n" + "58:" // Height 3: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x16, x16, #0x1\n" - "cmp x16, x19\n" - "bne 54b\n" - "prfm pstl1keep, [x17, #0x0]\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x11, #0x0]\n" - "tbz %x[flags], #1, 62f\n" + "add x9, x9, #0x1\n" + "cmp x9, x19\n" + "bne 51b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x10, #0x0]\n" + "add x26, x10, x19, LSL #2\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x25, x26, x19, LSL #2\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbz %x[flags], #1, 59f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1r { v17.4s }, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -573,161 +552,181 @@ void a64_hybrid_fp32_mla_8x4 ( "fmax v24.4s, v24.4s, v17.4s\n" "fmax v25.4s, v25.4s, v17.4s\n" "fmax v26.4s, v26.4s, v17.4s\n" - "62:" // Height 3: No activation - "cmp x6, #0x4\n" - "bge 65f\n" - "tbz x6, #1, 63f\n" - "str d24, [x17], #0x8\n" - "str d25, [x13], #0x8\n" - "str d26, [x11], #0x8\n" - "tbz x6, #0, 64f\n" - "st1 { v24.s }[2], [x17]\n" - "st1 { v25.s }[2], [x13]\n" - "st1 { v26.s }[2], [x11]\n" - "b 64f\n" - "63:" // Height 3: Partial direct writeback: partial_1_0 - "str s24, [x17, #0x0]\n" - "str s25, [x13, #0x0]\n" - "str s26, [x11, #0x0]\n" - "64:" // Height 3: Partial direct writeback: Done - "b 66f\n" - "65:" // Height 3: Full writeback - "str q24, [x17, #0x0]\n" - "str q25, [x13, #0x0]\n" - "str q26, [x11, #0x0]\n" - "add x17, x17, #0x10\n" - "add x13, x13, #0x10\n" - "add x11, x11, #0x10\n" - "66:" // 
Height 3: Writeback done - "subs x6, x6, #0x4\n" - "bgt 47b\n" - "b 178f\n" - "67:" // Height 4 - "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" - "mov x8, %x[bias]\n" - "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 68f\n" - "ldr x17, [%x[output_ptr], #0x0]\n" - "add x17, x17, x19, LSL #2\n" - "ldr x13, [%x[output_ptr], #0x8]\n" - "ldr x11, [%x[output_ptr], #0x10]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x18]\n" - "add x11, x11, x19, LSL #2\n" - "add x9, x9, x19, LSL #2\n" - "b 69f\n" - "68:" // Height 4: setup direct output - "mov x17, %x[output_ptr]\n" - "add x13, x17, x19, LSL #2\n" - "add x11, x13, x19, LSL #2\n" - "add x9, x11, x19, LSL #2\n" - "69:" // Height 4: Column loop - "cbz x8, 70f\n" - "ldr q24, [x8, #0x0]\n" + "59:" // Height 3: No activation + "cmp x13, #0x4\n" + "bge 62f\n" + "tbz x13, #1, 60f\n" + "str d24, [x10], #0x8\n" + "str d25, [x26], #0x8\n" + "str d26, [x25], #0x8\n" + "tbz x13, #0, 61f\n" + "st1 { v24.s }[2], [x10]\n" + "st1 { v25.s }[2], [x26]\n" + "st1 { v26.s }[2], [x25]\n" + "b 61f\n" + "60:" // Height 3: Partial direct writeback: partial_1_0 + "str s24, [x10, #0x0]\n" + "str s25, [x26, #0x0]\n" + "str s26, [x25, #0x0]\n" + "61:" // Height 3: Partial direct writeback: Done + "b 63f\n" + "62:" // Height 3: Full writeback + "str q24, [x10, #0x0]\n" + "add x10, x10, #0x10\n" + "str q25, [x26, #0x0]\n" + "str q26, [x25, #0x0]\n" + "63:" // Height 3: Writeback done + "subs x13, x13, #0x4\n" + "bgt 44b\n" + "b 170f\n" + "64:" // Height 4 + "ldr x13, [%x[args_ptr], %[offsetof_N]]\n" + "mov x11, %x[bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x10, %x[output_ptr]\n" + "65:" // Height 4: Column loop + "cbz x11, 66f\n" + "ldr q24, [x11, #0x0]\n" "mov v25.16b, v24.16b\n" - "add x8, x8, #0x10\n" + "add x11, x11, #0x10\n" "mov v26.16b, v24.16b\n" "mov v27.16b, v24.16b\n" - "b 75f\n" - "70:" // Height 4: no bias - "tbz %x[flags], #0, 
74f\n" - "cmp x6, #0x4\n" - "bge 73f\n" - "tbz x6, #1, 71f\n" - "ldr d24, [x17], #0x8\n" - "ldr d25, [x13], #0x8\n" - "ldr d26, [x11], #0x8\n" - "ldr d27, [x9], #0x8\n" + "b 71f\n" + "66:" // Height 4: no bias + "tbz %x[flags], #0, 70f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x13, #0x4\n" + "add x26, x10, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "bge 69f\n" + "tbz x13, #1, 67f\n" + "ldr d24, [x10], #0x8\n" + "ldr d25, [x26], #0x8\n" "mov x19, #0x8\n" - "tbz x6, #0, 72f\n" - "ld1 { v24.s }[2], [x17]\n" - "ld1 { v25.s }[2], [x13]\n" - "ld1 { v26.s }[2], [x11]\n" - "ld1 { v27.s }[2], [x9]\n" - "b 72f\n" - "71:" // Height 4: Partial accumulate: partial_1_0 + "ldr d26, [x25], #0x8\n" + "ldr d27, [x24], #0x8\n" + "tbz x13, #0, 68f\n" + "ld1 { v24.s }[2], [x10]\n" + "ld1 { v25.s }[2], [x26]\n" + "ld1 { v26.s }[2], [x25]\n" + "ld1 { v27.s }[2], [x24]\n" + "b 68f\n" + "67:" // Height 4: Partial accumulate: partial_1_0 + "ldr s24, [x10, #0x0]\n" "mov x19, #0x0\n" - "ldr s24, [x17, #0x0]\n" - "ldr s25, [x13, #0x0]\n" - "ldr s26, [x11, #0x0]\n" - "ldr s27, [x9, #0x0]\n" - "72:" // Height 4: Partial accumulate: Done - "sub x17, x17, x19\n" - "sub x13, x13, x19\n" - "sub x11, x11, x19\n" - "sub x9, x9, x19\n" - "b 75f\n" - "73:" // Height 4: full accumulate - "ldr q24, [x17, #0x0]\n" - "ldr q25, [x13, #0x0]\n" - "ldr q26, [x11, #0x0]\n" - "ldr q27, [x9, #0x0]\n" - "b 75f\n" - "74:" // Height 4: no accumulate + "ldr s25, [x26, #0x0]\n" + "ldr s26, [x25, #0x0]\n" + "ldr s27, [x24, #0x0]\n" + "68:" // Height 4: Partial accumulate: Done + "sub x10, x10, x19\n" + "b 71f\n" + "69:" // Height 4: full accumulate + "ldr q24, [x10, #0x0]\n" + "ldr q25, [x26, #0x0]\n" + "ldr q26, [x25, #0x0]\n" + "ldr q27, [x24, #0x0]\n" + "b 71f\n" + "70:" // Height 4: no accumulate "movi v24.16b, #0x0\n" "movi v25.16b, #0x0\n" "movi v26.16b, #0x0\n" "movi v27.16b, #0x0\n" - "75:" // Height 4: setup done - "mov x16, #0x0\n" - "76:" // 
Height 4: String loop + "71:" // Height 4: setup done + "mov x9, #0x0\n" + "72:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w15, [x20, x16, LSL #0x2]\n" - "tbz %x[flags], #3, 77f\n" - "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "ldr w28, [x20, x9, LSL #0x2]\n" + "tbz %x[flags], #3, 73f\n" + "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x14, [x20, #0x0]\n" - "ldr x12, [x20, #0x8]\n" - "ldr x10, [x20, #0x10]\n" - "ldr x28, [x20, #0x18]\n" - "cbnz x16, 78f\n" + "ldr x27, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x25, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "cbnz x9, 74f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x14, x14, x19, LSL #2\n" - "add x12, x12, x19, LSL #2\n" - "add x10, x10, x19, LSL #2\n" - "add x28, x28, x19, LSL #2\n" - "b 78f\n" - "77:" // Height 4: setup direct input - "mov x14, %x[input_ptr]\n" - "add x12, x14, x19, LSL #2\n" - "add x10, x12, x19, LSL #2\n" - "add x28, x10, x19, LSL #2\n" - "78:" // Height 4: input setup done - "cmp x15, #0x4\n" - "blt 81f\n" - "cmp x15, #0x8\n" - "blt 80f\n" - "79:" // Height 4: Multiply loop: Main loop head - "ldr q0, [x14, #0x0]\n" - "ldr q1, [x12, #0x0]\n" - "ldr q2, [x10, #0x0]\n" - "ldr q3, [x28, #0x0]\n" - "ldr q8, [x7, #0x0]\n" + "add x27, x27, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "b 74f\n" + "73:" // Height 4: setup direct input + "mov x27, %x[input_ptr]\n" + "add x26, x27, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "74:" // Height 4: input setup done + "cmp x28, #0x4\n" + "blt 77f\n" + "ldr q0, [x27, #0x0]\n" + "ldr q1, [x26, #0x0]\n" + "cmp x28, #0x8\n" + "ldr q2, [x25, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q8, [x12, #0x0]\n" + "blt 76f\n" + "75:" // Height 4: Multiply loop: Main loop head "fmla v24.4s, v8.4s, v0.s[0]\n" 
- "ldr q9, [x7, #0x10]\n" + "ldr q9, [x12, #0x10]\n" + "add x27, x27, #0x10\n" "fmla v25.4s, v8.4s, v1.s[0]\n" - "ldr q10, [x7, #0x20]\n" + "ldr q10, [x12, #0x20]\n" + "add x26, x26, #0x10\n" "fmla v26.4s, v8.4s, v2.s[0]\n" - "ldr q11, [x7, #0x30]\n" + "ldr q11, [x12, #0x30]\n" + "add x25, x25, #0x10\n" "fmla v27.4s, v8.4s, v3.s[0]\n" - "add x14, x14, #0x10\n" - "prfm pldl1keep, [x14, #0x80]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "add x24, x24, #0x10\n" "fmla v24.4s, v9.4s, v0.s[1]\n" - "add x12, x12, #0x10\n" - "prfm pldl1keep, [x12, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "sub x28, x28, #0x4\n" "fmla v25.4s, v9.4s, v1.s[1]\n" - "add x10, x10, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "cmp x28, #0x8\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x12, x12, #0x40\n" + "fmla v27.4s, v9.4s, v3.s[1]\n" + "ldr q8, [x12, #0x0]\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "fmla v27.4s, v10.4s, v3.s[2]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "ldr q0, [x27, #0x0]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "ldr q1, [x26, #0x0]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "ldr q2, [x25, #0x0]\n" + "fmla v27.4s, v11.4s, v3.s[3]\n" + "ldr q3, [x24, #0x0]\n" + "bge 75b\n" + "76:" // Height 4: Multiply loop: Single iteration only + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x12, #0x10]\n" + "sub x28, x28, #0x4\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr q10, [x12, #0x20]\n" + "add x27, x27, #0x10\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr q11, [x12, #0x30]\n" + "add x26, x26, #0x10\n" + "fmla v27.4s, v8.4s, v3.s[0]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "add x25, x25, #0x10\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x12, x12, #0x40\n" "fmla v26.4s, v9.4s, v2.s[1]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, 
[x24, #0x80]\n" "fmla v27.4s, v9.4s, v3.s[1]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "sub x15, x15, #0x4\n" "fmla v24.4s, v10.4s, v0.s[2]\n" - "cmp x15, #0x8\n" - "add x7, x7, #0x40\n" "fmla v25.4s, v10.4s, v1.s[2]\n" "fmla v26.4s, v10.4s, v2.s[2]\n" "fmla v27.4s, v10.4s, v3.s[2]\n" @@ -735,67 +734,35 @@ void a64_hybrid_fp32_mla_8x4 ( "fmla v25.4s, v11.4s, v1.s[3]\n" "fmla v26.4s, v11.4s, v2.s[3]\n" "fmla v27.4s, v11.4s, v3.s[3]\n" - "bge 79b\n" - "80:" // Height 4: Multiply loop: Single iteration only - "sub x15, x15, #0x4\n" - "ldr q0, [x14, #0x0]\n" - "ldr q1, [x12, #0x0]\n" - "ldr q2, [x10, #0x0]\n" - "ldr q3, [x28, #0x0]\n" - "ldr q12, [x7, #0x0]\n" + "77:" // Height 4: Multiply loop: Main loop skip + "cbz x28, 79f\n" + "78:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x27], #0x4\n" + "sub x28, x28, #0x1\n" + "ldr s1, [x26], #0x4\n" + "ldr s2, [x25], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr q12, [x12, #0x0]\n" "fmla v24.4s, v12.4s, v0.s[0]\n" - "ldr q13, [x7, #0x10]\n" + "add x12, x12, #0x10\n" "fmla v25.4s, v12.4s, v1.s[0]\n" - "ldr q14, [x7, #0x20]\n" "fmla v26.4s, v12.4s, v2.s[0]\n" - "ldr q15, [x7, #0x30]\n" "fmla v27.4s, v12.4s, v3.s[0]\n" - "add x14, x14, #0x10\n" - "prfm pldl1keep, [x14, #0x80]\n" - "fmla v24.4s, v13.4s, v0.s[1]\n" - "add x12, x12, #0x10\n" - "prfm pldl1keep, [x12, #0x80]\n" - "fmla v25.4s, v13.4s, v1.s[1]\n" - "add x10, x10, #0x10\n" - "fmla v26.4s, v13.4s, v2.s[1]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x28, x28, #0x10\n" - "fmla v27.4s, v13.4s, v3.s[1]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x7, x7, #0x40\n" - "fmla v24.4s, v14.4s, v0.s[2]\n" - "fmla v25.4s, v14.4s, v1.s[2]\n" - "fmla v26.4s, v14.4s, v2.s[2]\n" - "fmla v27.4s, v14.4s, v3.s[2]\n" - "fmla v24.4s, v15.4s, v0.s[3]\n" - "fmla v25.4s, v15.4s, v1.s[3]\n" - "fmla v26.4s, v15.4s, v2.s[3]\n" - "fmla v27.4s, v15.4s, v3.s[3]\n" - "81:" // Height 4: Multiply loop: Main loop skip - "cbz x15, 83f\n" - "82:" // Height 4: Multiply loop: Odd block loop - "ldr s0, 
[x14], #0x4\n" - "ldr s1, [x12], #0x4\n" - "ldr s2, [x10], #0x4\n" - "ldr s3, [x28], #0x4\n" - "ldr q16, [x7, #0x0]\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "sub x15, x15, #0x1\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "add x7, x7, #0x10\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "cbnz x15, 82b\n" - "83:" // Height 4: Multiply loop: No odd multiplies + "cbnz x28, 78b\n" + "79:" // Height 4: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x16, x16, #0x1\n" - "cmp x16, x19\n" - "bne 76b\n" - "prfm pstl1keep, [x17, #0x0]\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x11, #0x0]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "tbz %x[flags], #1, 84f\n" + "add x9, x9, #0x1\n" + "cmp x9, x19\n" + "bne 72b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x10, #0x0]\n" + "add x26, x10, x19, LSL #2\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x25, x26, x19, LSL #2\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add x24, x25, x19, LSL #2\n" + "prfm pstl1keep, [x24, #0x0]\n" + "tbz %x[flags], #1, 80f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1r { v17.4s }, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -808,184 +775,207 @@ void a64_hybrid_fp32_mla_8x4 ( "fmax v25.4s, v25.4s, v17.4s\n" "fmax v26.4s, v26.4s, v17.4s\n" "fmax v27.4s, v27.4s, v17.4s\n" - "84:" // Height 4: No activation - "cmp x6, #0x4\n" - "bge 87f\n" - "tbz x6, #1, 85f\n" - "str d24, [x17], #0x8\n" - "str d25, [x13], #0x8\n" - "str d26, [x11], #0x8\n" - "str d27, [x9], #0x8\n" - "tbz x6, #0, 86f\n" - "st1 { v24.s }[2], [x17]\n" - "st1 { v25.s }[2], [x13]\n" - "st1 { v26.s }[2], [x11]\n" - "st1 { v27.s }[2], [x9]\n" - "b 86f\n" - "85:" // Height 4: Partial direct writeback: partial_1_0 - "str s24, [x17, #0x0]\n" - "str s25, [x13, #0x0]\n" - "str s26, [x11, #0x0]\n" - "str s27, [x9, #0x0]\n" - "86:" // Height 4: Partial direct writeback: Done - "b 88f\n" - "87:" // Height 4: Full writeback - "str q24, [x17, 
#0x0]\n" - "str q25, [x13, #0x0]\n" - "str q26, [x11, #0x0]\n" - "str q27, [x9, #0x0]\n" - "add x17, x17, #0x10\n" - "add x13, x13, #0x10\n" - "add x11, x11, #0x10\n" - "add x9, x9, #0x10\n" - "88:" // Height 4: Writeback done - "subs x6, x6, #0x4\n" - "bgt 69b\n" - "b 178f\n" - "89:" // Height 5 - "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" - "mov x8, %x[bias]\n" - "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 90f\n" - "ldr x17, [%x[output_ptr], #0x0]\n" - "add x17, x17, x19, LSL #2\n" - "ldr x13, [%x[output_ptr], #0x8]\n" - "ldr x11, [%x[output_ptr], #0x10]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x18]\n" - "ldr x27, [%x[output_ptr], #0x20]\n" - "add x11, x11, x19, LSL #2\n" - "add x9, x9, x19, LSL #2\n" - "add x27, x27, x19, LSL #2\n" - "b 91f\n" - "90:" // Height 5: setup direct output - "mov x17, %x[output_ptr]\n" - "add x13, x17, x19, LSL #2\n" - "add x11, x13, x19, LSL #2\n" - "add x9, x11, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "91:" // Height 5: Column loop - "cbz x8, 92f\n" - "ldr q24, [x8, #0x0]\n" + "80:" // Height 4: No activation + "cmp x13, #0x4\n" + "bge 83f\n" + "tbz x13, #1, 81f\n" + "str d24, [x10], #0x8\n" + "str d25, [x26], #0x8\n" + "str d26, [x25], #0x8\n" + "str d27, [x24], #0x8\n" + "tbz x13, #0, 82f\n" + "st1 { v24.s }[2], [x10]\n" + "st1 { v25.s }[2], [x26]\n" + "st1 { v26.s }[2], [x25]\n" + "st1 { v27.s }[2], [x24]\n" + "b 82f\n" + "81:" // Height 4: Partial direct writeback: partial_1_0 + "str s24, [x10, #0x0]\n" + "str s25, [x26, #0x0]\n" + "str s26, [x25, #0x0]\n" + "str s27, [x24, #0x0]\n" + "82:" // Height 4: Partial direct writeback: Done + "b 84f\n" + "83:" // Height 4: Full writeback + "str q24, [x10, #0x0]\n" + "add x10, x10, #0x10\n" + "str q25, [x26, #0x0]\n" + "str q26, [x25, #0x0]\n" + "str q27, [x24, #0x0]\n" + "84:" // Height 4: Writeback done + "subs x13, x13, #0x4\n" + "bgt 65b\n" + "b 170f\n" + "85:" // Height 5 
+ "ldr x13, [%x[args_ptr], %[offsetof_N]]\n" + "mov x11, %x[bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x10, %x[output_ptr]\n" + "86:" // Height 5: Column loop + "cbz x11, 87f\n" + "ldr q24, [x11, #0x0]\n" "mov v25.16b, v24.16b\n" - "add x8, x8, #0x10\n" + "add x11, x11, #0x10\n" "mov v26.16b, v24.16b\n" "mov v27.16b, v24.16b\n" "mov v28.16b, v24.16b\n" - "b 97f\n" - "92:" // Height 5: no bias - "tbz %x[flags], #0, 96f\n" - "cmp x6, #0x4\n" - "bge 95f\n" - "tbz x6, #1, 93f\n" - "ldr d24, [x17], #0x8\n" - "ldr d25, [x13], #0x8\n" - "ldr d26, [x11], #0x8\n" - "ldr d27, [x9], #0x8\n" - "ldr d28, [x27], #0x8\n" + "b 92f\n" + "87:" // Height 5: no bias + "tbz %x[flags], #0, 91f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x13, #0x4\n" + "add x26, x10, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "bge 90f\n" + "tbz x13, #1, 88f\n" + "ldr d24, [x10], #0x8\n" + "ldr d25, [x26], #0x8\n" "mov x19, #0x8\n" - "tbz x6, #0, 94f\n" - "ld1 { v24.s }[2], [x17]\n" - "ld1 { v25.s }[2], [x13]\n" - "ld1 { v26.s }[2], [x11]\n" - "ld1 { v27.s }[2], [x9]\n" - "ld1 { v28.s }[2], [x27]\n" - "b 94f\n" - "93:" // Height 5: Partial accumulate: partial_1_0 + "ldr d26, [x25], #0x8\n" + "ldr d27, [x24], #0x8\n" + "ldr d28, [x23], #0x8\n" + "tbz x13, #0, 89f\n" + "ld1 { v24.s }[2], [x10]\n" + "ld1 { v25.s }[2], [x26]\n" + "ld1 { v26.s }[2], [x25]\n" + "ld1 { v27.s }[2], [x24]\n" + "ld1 { v28.s }[2], [x23]\n" + "b 89f\n" + "88:" // Height 5: Partial accumulate: partial_1_0 + "ldr s24, [x10, #0x0]\n" "mov x19, #0x0\n" - "ldr s24, [x17, #0x0]\n" - "ldr s25, [x13, #0x0]\n" - "ldr s26, [x11, #0x0]\n" - "ldr s27, [x9, #0x0]\n" - "ldr s28, [x27, #0x0]\n" - "94:" // Height 5: Partial accumulate: Done - "sub x17, x17, x19\n" - "sub x13, x13, x19\n" - "sub x11, x11, x19\n" - "sub x9, x9, x19\n" - "sub x27, x27, x19\n" - "b 97f\n" - "95:" // Height 5: full accumulate - "ldr q24, [x17, #0x0]\n" - 
"ldr q25, [x13, #0x0]\n" - "ldr q26, [x11, #0x0]\n" - "ldr q27, [x9, #0x0]\n" - "ldr q28, [x27, #0x0]\n" - "b 97f\n" - "96:" // Height 5: no accumulate + "ldr s25, [x26, #0x0]\n" + "ldr s26, [x25, #0x0]\n" + "ldr s27, [x24, #0x0]\n" + "ldr s28, [x23, #0x0]\n" + "89:" // Height 5: Partial accumulate: Done + "sub x10, x10, x19\n" + "b 92f\n" + "90:" // Height 5: full accumulate + "ldr q24, [x10, #0x0]\n" + "ldr q25, [x26, #0x0]\n" + "ldr q26, [x25, #0x0]\n" + "ldr q27, [x24, #0x0]\n" + "ldr q28, [x23, #0x0]\n" + "b 92f\n" + "91:" // Height 5: no accumulate "movi v24.16b, #0x0\n" "movi v25.16b, #0x0\n" "movi v26.16b, #0x0\n" "movi v27.16b, #0x0\n" "movi v28.16b, #0x0\n" - "97:" // Height 5: setup done - "mov x16, #0x0\n" - "98:" // Height 5: String loop + "92:" // Height 5: setup done + "mov x9, #0x0\n" + "93:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w15, [x20, x16, LSL #0x2]\n" - "tbz %x[flags], #3, 99f\n" - "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "ldr w28, [x20, x9, LSL #0x2]\n" + "tbz %x[flags], #3, 94f\n" + "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x14, [x20, #0x0]\n" - "ldr x12, [x20, #0x8]\n" - "ldr x10, [x20, #0x10]\n" - "ldr x28, [x20, #0x18]\n" - "ldr x26, [x20, #0x20]\n" - "cbnz x16, 100f\n" + "ldr x27, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x25, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x23, [x20, #0x20]\n" + "cbnz x9, 95f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x14, x14, x19, LSL #2\n" - "add x12, x12, x19, LSL #2\n" - "add x10, x10, x19, LSL #2\n" - "add x28, x28, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" "add x26, x26, x19, LSL #2\n" - "b 100f\n" - "99:" // Height 5: setup direct input - "mov x14, %x[input_ptr]\n" - "add x12, x14, x19, LSL #2\n" - "add x10, x12, x19, LSL #2\n" - "add x28, x10, x19, LSL #2\n" - "add x26, x28, x19, LSL #2\n" - "100:" // Height 
5: input setup done - "cmp x15, #0x4\n" - "blt 103f\n" - "cmp x15, #0x8\n" - "blt 102f\n" - "101:" // Height 5: Multiply loop: Main loop head - "ldr q0, [x14, #0x0]\n" - "ldr q1, [x12, #0x0]\n" - "ldr q2, [x10, #0x0]\n" - "ldr q3, [x28, #0x0]\n" - "ldr q4, [x26, #0x0]\n" - "ldr q8, [x7, #0x0]\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "b 95f\n" + "94:" // Height 5: setup direct input + "mov x27, %x[input_ptr]\n" + "add x26, x27, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "95:" // Height 5: input setup done + "cmp x28, #0x4\n" + "blt 98f\n" + "ldr q0, [x27, #0x0]\n" + "ldr q1, [x26, #0x0]\n" + "cmp x28, #0x8\n" + "ldr q2, [x25, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x23, #0x0]\n" + "ldr q8, [x12, #0x0]\n" + "blt 97f\n" + "96:" // Height 5: Multiply loop: Main loop head "fmla v24.4s, v8.4s, v0.s[0]\n" - "ldr q9, [x7, #0x10]\n" + "ldr q9, [x12, #0x10]\n" + "add x27, x27, #0x10\n" "fmla v25.4s, v8.4s, v1.s[0]\n" - "ldr q10, [x7, #0x20]\n" + "ldr q10, [x12, #0x20]\n" + "add x26, x26, #0x10\n" "fmla v26.4s, v8.4s, v2.s[0]\n" - "ldr q11, [x7, #0x30]\n" + "ldr q11, [x12, #0x30]\n" + "add x25, x25, #0x10\n" "fmla v27.4s, v8.4s, v3.s[0]\n" - "add x14, x14, #0x10\n" - "prfm pldl1keep, [x14, #0x80]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "add x24, x24, #0x10\n" "fmla v28.4s, v8.4s, v4.s[0]\n" - "add x12, x12, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x23, x23, #0x10\n" "fmla v24.4s, v9.4s, v0.s[1]\n" - "prfm pldl1keep, [x12, #0x80]\n" - "add x10, x10, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "sub x28, x28, #0x4\n" "fmla v25.4s, v9.4s, v1.s[1]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x28, #0x8\n" "fmla v26.4s, v9.4s, v2.s[1]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x23, #0x80]\n" + "add x12, x12, #0x40\n" "fmla v27.4s, 
v9.4s, v3.s[1]\n" + "ldr q8, [x12, #0x0]\n" + "fmla v28.4s, v9.4s, v4.s[1]\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "fmla v27.4s, v10.4s, v3.s[2]\n" + "fmla v28.4s, v10.4s, v4.s[2]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "ldr q0, [x27, #0x0]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "ldr q1, [x26, #0x0]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "ldr q2, [x25, #0x0]\n" + "fmla v27.4s, v11.4s, v3.s[3]\n" + "ldr q3, [x24, #0x0]\n" + "fmla v28.4s, v11.4s, v4.s[3]\n" + "ldr q4, [x23, #0x0]\n" + "bge 96b\n" + "97:" // Height 5: Multiply loop: Single iteration only + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x12, #0x10]\n" + "sub x28, x28, #0x4\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr q10, [x12, #0x20]\n" + "add x27, x27, #0x10\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr q11, [x12, #0x30]\n" + "add x26, x26, #0x10\n" + "fmla v27.4s, v8.4s, v3.s[0]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "add x25, x25, #0x10\n" + "fmla v28.4s, v8.4s, v4.s[0]\n" "prfm pldl1keep, [x26, #0x80]\n" - "sub x15, x15, #0x4\n" + "add x24, x24, #0x10\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x23, x23, #0x10\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x12, x12, #0x40\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "fmla v27.4s, v9.4s, v3.s[1]\n" "fmla v28.4s, v9.4s, v4.s[1]\n" - "cmp x15, #0x8\n" "fmla v24.4s, v10.4s, v0.s[2]\n" - "add x7, x7, #0x40\n" "fmla v25.4s, v10.4s, v1.s[2]\n" "fmla v26.4s, v10.4s, v2.s[2]\n" "fmla v27.4s, v10.4s, v3.s[2]\n" @@ -995,77 +985,39 @@ void a64_hybrid_fp32_mla_8x4 ( "fmla v26.4s, v11.4s, v2.s[3]\n" "fmla v27.4s, v11.4s, v3.s[3]\n" "fmla v28.4s, v11.4s, v4.s[3]\n" - "bge 101b\n" - "102:" // Height 5: Multiply loop: Single iteration only - "sub x15, x15, #0x4\n" - "ldr q0, [x14, #0x0]\n" - "ldr q1, [x12, #0x0]\n" - "ldr q2, [x10, #0x0]\n" - "ldr q3, [x28, #0x0]\n" - "ldr q4, [x26, #0x0]\n" - "ldr q12, 
[x7, #0x0]\n" + "98:" // Height 5: Multiply loop: Main loop skip + "cbz x28, 100f\n" + "99:" // Height 5: Multiply loop: Odd block loop + "ldr s0, [x27], #0x4\n" + "sub x28, x28, #0x1\n" + "ldr s1, [x26], #0x4\n" + "ldr s2, [x25], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x23], #0x4\n" + "ldr q12, [x12, #0x0]\n" "fmla v24.4s, v12.4s, v0.s[0]\n" - "ldr q13, [x7, #0x10]\n" + "add x12, x12, #0x10\n" "fmla v25.4s, v12.4s, v1.s[0]\n" - "ldr q14, [x7, #0x20]\n" "fmla v26.4s, v12.4s, v2.s[0]\n" - "ldr q15, [x7, #0x30]\n" "fmla v27.4s, v12.4s, v3.s[0]\n" - "add x14, x14, #0x10\n" - "prfm pldl1keep, [x14, #0x80]\n" "fmla v28.4s, v12.4s, v4.s[0]\n" - "add x12, x12, #0x10\n" - "fmla v24.4s, v13.4s, v0.s[1]\n" - "prfm pldl1keep, [x12, #0x80]\n" - "add x10, x10, #0x10\n" - "fmla v25.4s, v13.4s, v1.s[1]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x28, x28, #0x10\n" - "fmla v26.4s, v13.4s, v2.s[1]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" - "fmla v27.4s, v13.4s, v3.s[1]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "add x7, x7, #0x40\n" - "fmla v28.4s, v13.4s, v4.s[1]\n" - "fmla v24.4s, v14.4s, v0.s[2]\n" - "fmla v25.4s, v14.4s, v1.s[2]\n" - "fmla v26.4s, v14.4s, v2.s[2]\n" - "fmla v27.4s, v14.4s, v3.s[2]\n" - "fmla v28.4s, v14.4s, v4.s[2]\n" - "fmla v24.4s, v15.4s, v0.s[3]\n" - "fmla v25.4s, v15.4s, v1.s[3]\n" - "fmla v26.4s, v15.4s, v2.s[3]\n" - "fmla v27.4s, v15.4s, v3.s[3]\n" - "fmla v28.4s, v15.4s, v4.s[3]\n" - "103:" // Height 5: Multiply loop: Main loop skip - "cbz x15, 105f\n" - "104:" // Height 5: Multiply loop: Odd block loop - "ldr s0, [x14], #0x4\n" - "ldr s1, [x12], #0x4\n" - "ldr s2, [x10], #0x4\n" - "ldr s3, [x28], #0x4\n" - "ldr s4, [x26], #0x4\n" - "ldr q16, [x7, #0x0]\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "sub x15, x15, #0x1\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "add x7, x7, #0x10\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "cbnz x15, 104b\n" - "105:" // Height 5: Multiply 
loop: No odd multiplies + "cbnz x28, 99b\n" + "100:" // Height 5: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x16, x16, #0x1\n" - "cmp x16, x19\n" - "bne 98b\n" - "prfm pstl1keep, [x17, #0x0]\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x11, #0x0]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "prfm pstl1keep, [x27, #0x0]\n" - "tbz %x[flags], #1, 106f\n" + "add x9, x9, #0x1\n" + "cmp x9, x19\n" + "bne 93b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x10, #0x0]\n" + "add x26, x10, x19, LSL #2\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x25, x26, x19, LSL #2\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add x24, x25, x19, LSL #2\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbz %x[flags], #1, 101f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1r { v17.4s }, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -1080,206 +1032,232 @@ void a64_hybrid_fp32_mla_8x4 ( "fmax v27.4s, v27.4s, v17.4s\n" "fmin v28.4s, v28.4s, v16.4s\n" "fmax v28.4s, v28.4s, v17.4s\n" - "106:" // Height 5: No activation - "cmp x6, #0x4\n" - "bge 109f\n" - "tbz x6, #1, 107f\n" - "str d24, [x17], #0x8\n" - "str d25, [x13], #0x8\n" - "str d26, [x11], #0x8\n" - "str d27, [x9], #0x8\n" - "str d28, [x27], #0x8\n" - "tbz x6, #0, 108f\n" - "st1 { v24.s }[2], [x17]\n" - "st1 { v25.s }[2], [x13]\n" - "st1 { v26.s }[2], [x11]\n" - "st1 { v27.s }[2], [x9]\n" - "st1 { v28.s }[2], [x27]\n" - "b 108f\n" - "107:" // Height 5: Partial direct writeback: partial_1_0 - "str s24, [x17, #0x0]\n" - "str s25, [x13, #0x0]\n" - "str s26, [x11, #0x0]\n" - "str s27, [x9, #0x0]\n" - "str s28, [x27, #0x0]\n" - "108:" // Height 5: Partial direct writeback: Done - "b 110f\n" - "109:" // Height 5: Full writeback - "str q24, [x17, #0x0]\n" - "str q25, [x13, #0x0]\n" - "str q26, [x11, #0x0]\n" - "str q27, [x9, #0x0]\n" - "str q28, [x27, #0x0]\n" - "add x17, x17, #0x10\n" - "add x13, x13, 
#0x10\n" - "add x11, x11, #0x10\n" - "add x9, x9, #0x10\n" - "add x27, x27, #0x10\n" - "110:" // Height 5: Writeback done - "subs x6, x6, #0x4\n" - "bgt 91b\n" - "b 178f\n" - "111:" // Height 6 - "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" - "mov x8, %x[bias]\n" - "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 112f\n" - "ldr x17, [%x[output_ptr], #0x0]\n" - "add x17, x17, x19, LSL #2\n" - "ldr x13, [%x[output_ptr], #0x8]\n" - "ldr x11, [%x[output_ptr], #0x10]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x18]\n" - "ldr x27, [%x[output_ptr], #0x20]\n" - "add x11, x11, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x28]\n" - "add x9, x9, x19, LSL #2\n" - "add x27, x27, x19, LSL #2\n" - "add x25, x25, x19, LSL #2\n" - "b 113f\n" - "112:" // Height 6: setup direct output - "mov x17, %x[output_ptr]\n" - "add x13, x17, x19, LSL #2\n" - "add x11, x13, x19, LSL #2\n" - "add x9, x11, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "113:" // Height 6: Column loop - "cbz x8, 114f\n" - "ldr q24, [x8, #0x0]\n" + "101:" // Height 5: No activation + "cmp x13, #0x4\n" + "bge 104f\n" + "tbz x13, #1, 102f\n" + "str d24, [x10], #0x8\n" + "str d25, [x26], #0x8\n" + "str d26, [x25], #0x8\n" + "str d27, [x24], #0x8\n" + "str d28, [x23], #0x8\n" + "tbz x13, #0, 103f\n" + "st1 { v24.s }[2], [x10]\n" + "st1 { v25.s }[2], [x26]\n" + "st1 { v26.s }[2], [x25]\n" + "st1 { v27.s }[2], [x24]\n" + "st1 { v28.s }[2], [x23]\n" + "b 103f\n" + "102:" // Height 5: Partial direct writeback: partial_1_0 + "str s24, [x10, #0x0]\n" + "str s25, [x26, #0x0]\n" + "str s26, [x25, #0x0]\n" + "str s27, [x24, #0x0]\n" + "str s28, [x23, #0x0]\n" + "103:" // Height 5: Partial direct writeback: Done + "b 105f\n" + "104:" // Height 5: Full writeback + "str q24, [x10, #0x0]\n" + "add x10, x10, #0x10\n" + "str q25, [x26, #0x0]\n" + "str q26, [x25, #0x0]\n" + "str q27, [x24, #0x0]\n" + "str q28, 
[x23, #0x0]\n" + "105:" // Height 5: Writeback done + "subs x13, x13, #0x4\n" + "bgt 86b\n" + "b 170f\n" + "106:" // Height 6 + "ldr x13, [%x[args_ptr], %[offsetof_N]]\n" + "mov x11, %x[bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x10, %x[output_ptr]\n" + "107:" // Height 6: Column loop + "cbz x11, 108f\n" + "ldr q24, [x11, #0x0]\n" "mov v25.16b, v24.16b\n" - "add x8, x8, #0x10\n" + "add x11, x11, #0x10\n" "mov v26.16b, v24.16b\n" "mov v27.16b, v24.16b\n" "mov v28.16b, v24.16b\n" "mov v29.16b, v24.16b\n" - "b 119f\n" - "114:" // Height 6: no bias - "tbz %x[flags], #0, 118f\n" - "cmp x6, #0x4\n" - "bge 117f\n" - "tbz x6, #1, 115f\n" - "ldr d24, [x17], #0x8\n" - "ldr d25, [x13], #0x8\n" - "ldr d26, [x11], #0x8\n" - "ldr d27, [x9], #0x8\n" - "ldr d28, [x27], #0x8\n" - "ldr d29, [x25], #0x8\n" + "b 113f\n" + "108:" // Height 6: no bias + "tbz %x[flags], #0, 112f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x13, #0x4\n" + "add x26, x10, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "bge 111f\n" + "tbz x13, #1, 109f\n" + "ldr d24, [x10], #0x8\n" + "ldr d25, [x26], #0x8\n" "mov x19, #0x8\n" - "tbz x6, #0, 116f\n" - "ld1 { v24.s }[2], [x17]\n" - "ld1 { v25.s }[2], [x13]\n" - "ld1 { v26.s }[2], [x11]\n" - "ld1 { v27.s }[2], [x9]\n" - "ld1 { v28.s }[2], [x27]\n" - "ld1 { v29.s }[2], [x25]\n" - "b 116f\n" - "115:" // Height 6: Partial accumulate: partial_1_0 + "ldr d26, [x25], #0x8\n" + "ldr d27, [x24], #0x8\n" + "ldr d28, [x23], #0x8\n" + "ldr d29, [x22], #0x8\n" + "tbz x13, #0, 110f\n" + "ld1 { v24.s }[2], [x10]\n" + "ld1 { v25.s }[2], [x26]\n" + "ld1 { v26.s }[2], [x25]\n" + "ld1 { v27.s }[2], [x24]\n" + "ld1 { v28.s }[2], [x23]\n" + "ld1 { v29.s }[2], [x22]\n" + "b 110f\n" + "109:" // Height 6: Partial accumulate: partial_1_0 + "ldr s24, [x10, #0x0]\n" "mov x19, #0x0\n" - "ldr s24, [x17, #0x0]\n" - "ldr s25, [x13, #0x0]\n" - "ldr 
s26, [x11, #0x0]\n" - "ldr s27, [x9, #0x0]\n" - "ldr s28, [x27, #0x0]\n" - "ldr s29, [x25, #0x0]\n" - "116:" // Height 6: Partial accumulate: Done - "sub x17, x17, x19\n" - "sub x13, x13, x19\n" - "sub x11, x11, x19\n" - "sub x9, x9, x19\n" - "sub x27, x27, x19\n" - "sub x25, x25, x19\n" - "b 119f\n" - "117:" // Height 6: full accumulate - "ldr q24, [x17, #0x0]\n" - "ldr q25, [x13, #0x0]\n" - "ldr q26, [x11, #0x0]\n" - "ldr q27, [x9, #0x0]\n" - "ldr q28, [x27, #0x0]\n" - "ldr q29, [x25, #0x0]\n" - "b 119f\n" - "118:" // Height 6: no accumulate + "ldr s25, [x26, #0x0]\n" + "ldr s26, [x25, #0x0]\n" + "ldr s27, [x24, #0x0]\n" + "ldr s28, [x23, #0x0]\n" + "ldr s29, [x22, #0x0]\n" + "110:" // Height 6: Partial accumulate: Done + "sub x10, x10, x19\n" + "b 113f\n" + "111:" // Height 6: full accumulate + "ldr q24, [x10, #0x0]\n" + "ldr q25, [x26, #0x0]\n" + "ldr q26, [x25, #0x0]\n" + "ldr q27, [x24, #0x0]\n" + "ldr q28, [x23, #0x0]\n" + "ldr q29, [x22, #0x0]\n" + "b 113f\n" + "112:" // Height 6: no accumulate "movi v24.16b, #0x0\n" "movi v25.16b, #0x0\n" "movi v26.16b, #0x0\n" "movi v27.16b, #0x0\n" "movi v28.16b, #0x0\n" "movi v29.16b, #0x0\n" - "119:" // Height 6: setup done - "mov x16, #0x0\n" - "120:" // Height 6: String loop + "113:" // Height 6: setup done + "mov x9, #0x0\n" + "114:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w15, [x20, x16, LSL #0x2]\n" - "tbz %x[flags], #3, 121f\n" - "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "ldr w28, [x20, x9, LSL #0x2]\n" + "tbz %x[flags], #3, 115f\n" + "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x14, [x20, #0x0]\n" - "ldr x12, [x20, #0x8]\n" - "ldr x10, [x20, #0x10]\n" - "ldr x28, [x20, #0x18]\n" - "ldr x26, [x20, #0x20]\n" - "ldr x24, [x20, #0x28]\n" - "cbnz x16, 122f\n" + "ldr x27, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x25, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x23, 
[x20, #0x20]\n" + "ldr x22, [x20, #0x28]\n" + "cbnz x9, 116f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x14, x14, x19, LSL #2\n" - "add x12, x12, x19, LSL #2\n" - "add x10, x10, x19, LSL #2\n" - "add x28, x28, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" "add x26, x26, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" "add x24, x24, x19, LSL #2\n" - "b 122f\n" - "121:" // Height 6: setup direct input - "mov x14, %x[input_ptr]\n" - "add x12, x14, x19, LSL #2\n" - "add x10, x12, x19, LSL #2\n" - "add x28, x10, x19, LSL #2\n" - "add x26, x28, x19, LSL #2\n" - "add x24, x26, x19, LSL #2\n" - "122:" // Height 6: input setup done - "cmp x15, #0x4\n" - "blt 125f\n" - "cmp x15, #0x8\n" - "blt 124f\n" - "123:" // Height 6: Multiply loop: Main loop head - "ldr q0, [x14, #0x0]\n" - "ldr q1, [x12, #0x0]\n" - "ldr q2, [x10, #0x0]\n" - "ldr q3, [x28, #0x0]\n" - "ldr q4, [x26, #0x0]\n" - "ldr q5, [x24, #0x0]\n" - "ldr q8, [x7, #0x0]\n" + "add x23, x23, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "b 116f\n" + "115:" // Height 6: setup direct input + "mov x27, %x[input_ptr]\n" + "add x26, x27, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "116:" // Height 6: input setup done + "cmp x28, #0x4\n" + "blt 119f\n" + "ldr q0, [x27, #0x0]\n" + "ldr q1, [x26, #0x0]\n" + "cmp x28, #0x8\n" + "ldr q2, [x25, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x23, #0x0]\n" + "ldr q5, [x22, #0x0]\n" + "ldr q8, [x12, #0x0]\n" + "blt 118f\n" + "117:" // Height 6: Multiply loop: Main loop head "fmla v24.4s, v8.4s, v0.s[0]\n" - "ldr q9, [x7, #0x10]\n" + "ldr q9, [x12, #0x10]\n" + "add x27, x27, #0x10\n" "fmla v25.4s, v8.4s, v1.s[0]\n" - "ldr q10, [x7, #0x20]\n" + "ldr q10, [x12, #0x20]\n" + "add x26, x26, #0x10\n" "fmla v26.4s, v8.4s, v2.s[0]\n" - "ldr q11, [x7, #0x30]\n" + "ldr q11, [x12, #0x30]\n" + "add x25, x25, #0x10\n" "fmla v27.4s, v8.4s, v3.s[0]\n" - "add x14, x14, 
#0x10\n" - "prfm pldl1keep, [x14, #0x80]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "add x24, x24, #0x10\n" "fmla v28.4s, v8.4s, v4.s[0]\n" - "add x12, x12, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x23, x23, #0x10\n" "fmla v29.4s, v8.4s, v5.s[0]\n" - "prfm pldl1keep, [x12, #0x80]\n" - "add x10, x10, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x22, x22, #0x10\n" "fmla v24.4s, v9.4s, v0.s[1]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sub x28, x28, #0x4\n" "fmla v25.4s, v9.4s, v1.s[1]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x23, #0x80]\n" + "cmp x28, #0x8\n" "fmla v26.4s, v9.4s, v2.s[1]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "add x12, x12, #0x40\n" + "fmla v27.4s, v9.4s, v3.s[1]\n" + "ldr q8, [x12, #0x0]\n" + "fmla v28.4s, v9.4s, v4.s[1]\n" + "fmla v29.4s, v9.4s, v5.s[1]\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "fmla v27.4s, v10.4s, v3.s[2]\n" + "fmla v28.4s, v10.4s, v4.s[2]\n" + "fmla v29.4s, v10.4s, v5.s[2]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "ldr q0, [x27, #0x0]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "ldr q1, [x26, #0x0]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "ldr q2, [x25, #0x0]\n" + "fmla v27.4s, v11.4s, v3.s[3]\n" + "ldr q3, [x24, #0x0]\n" + "fmla v28.4s, v11.4s, v4.s[3]\n" + "ldr q4, [x23, #0x0]\n" + "fmla v29.4s, v11.4s, v5.s[3]\n" + "ldr q5, [x22, #0x0]\n" + "bge 117b\n" + "118:" // Height 6: Multiply loop: Single iteration only + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x12, #0x10]\n" + "sub x28, x28, #0x4\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr q10, [x12, #0x20]\n" + "add x27, x27, #0x10\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr q11, [x12, #0x30]\n" + "add x26, x26, #0x10\n" + "fmla v27.4s, v8.4s, v3.s[0]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "add x25, x25, #0x10\n" + "fmla v28.4s, v8.4s, v4.s[0]\n" "prfm pldl1keep, [x26, #0x80]\n" "add x24, x24, #0x10\n" - 
"fmla v27.4s, v9.4s, v3.s[1]\n" + "fmla v29.4s, v8.4s, v5.s[0]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x23, x23, #0x10\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" "prfm pldl1keep, [x24, #0x80]\n" - "sub x15, x15, #0x4\n" + "add x22, x22, #0x10\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "add x12, x12, #0x40\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v27.4s, v9.4s, v3.s[1]\n" "fmla v28.4s, v9.4s, v4.s[1]\n" - "cmp x15, #0x8\n" "fmla v29.4s, v9.4s, v5.s[1]\n" - "add x7, x7, #0x40\n" "fmla v24.4s, v10.4s, v0.s[2]\n" "fmla v25.4s, v10.4s, v1.s[2]\n" "fmla v26.4s, v10.4s, v2.s[2]\n" @@ -1292,87 +1270,43 @@ void a64_hybrid_fp32_mla_8x4 ( "fmla v27.4s, v11.4s, v3.s[3]\n" "fmla v28.4s, v11.4s, v4.s[3]\n" "fmla v29.4s, v11.4s, v5.s[3]\n" - "bge 123b\n" - "124:" // Height 6: Multiply loop: Single iteration only - "sub x15, x15, #0x4\n" - "ldr q0, [x14, #0x0]\n" - "ldr q1, [x12, #0x0]\n" - "ldr q2, [x10, #0x0]\n" - "ldr q3, [x28, #0x0]\n" - "ldr q4, [x26, #0x0]\n" - "ldr q5, [x24, #0x0]\n" - "ldr q12, [x7, #0x0]\n" + "119:" // Height 6: Multiply loop: Main loop skip + "cbz x28, 121f\n" + "120:" // Height 6: Multiply loop: Odd block loop + "ldr s0, [x27], #0x4\n" + "sub x28, x28, #0x1\n" + "ldr s1, [x26], #0x4\n" + "ldr s2, [x25], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x23], #0x4\n" + "ldr s5, [x22], #0x4\n" + "ldr q12, [x12, #0x0]\n" "fmla v24.4s, v12.4s, v0.s[0]\n" - "ldr q13, [x7, #0x10]\n" + "add x12, x12, #0x10\n" "fmla v25.4s, v12.4s, v1.s[0]\n" - "ldr q14, [x7, #0x20]\n" "fmla v26.4s, v12.4s, v2.s[0]\n" - "ldr q15, [x7, #0x30]\n" "fmla v27.4s, v12.4s, v3.s[0]\n" - "add x14, x14, #0x10\n" - "prfm pldl1keep, [x14, #0x80]\n" "fmla v28.4s, v12.4s, v4.s[0]\n" - "add x12, x12, #0x10\n" "fmla v29.4s, v12.4s, v5.s[0]\n" - "prfm pldl1keep, [x12, #0x80]\n" - "add x10, x10, #0x10\n" - "fmla v24.4s, v13.4s, v0.s[1]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x28, x28, #0x10\n" - "fmla v25.4s, v13.4s, 
v1.s[1]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" - "fmla v26.4s, v13.4s, v2.s[1]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "add x24, x24, #0x10\n" - "fmla v27.4s, v13.4s, v3.s[1]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "add x7, x7, #0x40\n" - "fmla v28.4s, v13.4s, v4.s[1]\n" - "fmla v29.4s, v13.4s, v5.s[1]\n" - "fmla v24.4s, v14.4s, v0.s[2]\n" - "fmla v25.4s, v14.4s, v1.s[2]\n" - "fmla v26.4s, v14.4s, v2.s[2]\n" - "fmla v27.4s, v14.4s, v3.s[2]\n" - "fmla v28.4s, v14.4s, v4.s[2]\n" - "fmla v29.4s, v14.4s, v5.s[2]\n" - "fmla v24.4s, v15.4s, v0.s[3]\n" - "fmla v25.4s, v15.4s, v1.s[3]\n" - "fmla v26.4s, v15.4s, v2.s[3]\n" - "fmla v27.4s, v15.4s, v3.s[3]\n" - "fmla v28.4s, v15.4s, v4.s[3]\n" - "fmla v29.4s, v15.4s, v5.s[3]\n" - "125:" // Height 6: Multiply loop: Main loop skip - "cbz x15, 127f\n" - "126:" // Height 6: Multiply loop: Odd block loop - "ldr s0, [x14], #0x4\n" - "ldr s1, [x12], #0x4\n" - "ldr s2, [x10], #0x4\n" - "ldr s3, [x28], #0x4\n" - "ldr s4, [x26], #0x4\n" - "ldr s5, [x24], #0x4\n" - "ldr q16, [x7, #0x0]\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "sub x15, x15, #0x1\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "add x7, x7, #0x10\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "fmla v29.4s, v16.4s, v5.s[0]\n" - "cbnz x15, 126b\n" - "127:" // Height 6: Multiply loop: No odd multiplies + "cbnz x28, 120b\n" + "121:" // Height 6: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x16, x16, #0x1\n" - "cmp x16, x19\n" - "bne 120b\n" - "prfm pstl1keep, [x17, #0x0]\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x11, #0x0]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "prfm pstl1keep, [x27, #0x0]\n" + "add x9, x9, #0x1\n" + "cmp x9, x19\n" + "bne 114b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x10, #0x0]\n" + "add x26, x10, x19, LSL #2\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x25, x26, x19, LSL #2\n" 
"prfm pstl1keep, [x25, #0x0]\n" - "tbz %x[flags], #1, 128f\n" + "add x24, x25, x19, LSL #2\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "tbz %x[flags], #1, 122f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1r { v17.4s }, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -1389,140 +1323,112 @@ void a64_hybrid_fp32_mla_8x4 ( "fmin v29.4s, v29.4s, v16.4s\n" "fmax v28.4s, v28.4s, v17.4s\n" "fmax v29.4s, v29.4s, v17.4s\n" - "128:" // Height 6: No activation - "cmp x6, #0x4\n" - "bge 131f\n" - "tbz x6, #1, 129f\n" - "str d24, [x17], #0x8\n" - "str d25, [x13], #0x8\n" - "str d26, [x11], #0x8\n" - "str d27, [x9], #0x8\n" - "str d28, [x27], #0x8\n" - "str d29, [x25], #0x8\n" - "tbz x6, #0, 130f\n" - "st1 { v24.s }[2], [x17]\n" - "st1 { v25.s }[2], [x13]\n" - "st1 { v26.s }[2], [x11]\n" - "st1 { v27.s }[2], [x9]\n" - "st1 { v28.s }[2], [x27]\n" - "st1 { v29.s }[2], [x25]\n" - "b 130f\n" - "129:" // Height 6: Partial direct writeback: partial_1_0 - "str s24, [x17, #0x0]\n" - "str s25, [x13, #0x0]\n" - "str s26, [x11, #0x0]\n" - "str s27, [x9, #0x0]\n" - "str s28, [x27, #0x0]\n" - "str s29, [x25, #0x0]\n" - "130:" // Height 6: Partial direct writeback: Done - "b 132f\n" - "131:" // Height 6: Full writeback - "str q24, [x17, #0x0]\n" - "str q25, [x13, #0x0]\n" - "str q26, [x11, #0x0]\n" - "str q27, [x9, #0x0]\n" - "str q28, [x27, #0x0]\n" - "str q29, [x25, #0x0]\n" - "add x17, x17, #0x10\n" - "add x13, x13, #0x10\n" - "add x11, x11, #0x10\n" - "add x9, x9, #0x10\n" - "add x27, x27, #0x10\n" - "add x25, x25, #0x10\n" - "132:" // Height 6: Writeback done - "subs x6, x6, #0x4\n" - "bgt 113b\n" - "b 178f\n" - "133:" // Height 7 - "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" - "mov x8, %x[bias]\n" - "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 134f\n" - "ldr x17, 
[%x[output_ptr], #0x0]\n" - "add x17, x17, x19, LSL #2\n" - "ldr x13, [%x[output_ptr], #0x8]\n" - "ldr x11, [%x[output_ptr], #0x10]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x18]\n" - "ldr x27, [%x[output_ptr], #0x20]\n" - "add x11, x11, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x28]\n" - "add x9, x9, x19, LSL #2\n" - "ldr x23, [%x[output_ptr], #0x30]\n" - "add x27, x27, x19, LSL #2\n" - "add x25, x25, x19, LSL #2\n" - "add x23, x23, x19, LSL #2\n" - "b 135f\n" - "134:" // Height 7: setup direct output - "mov x17, %x[output_ptr]\n" - "add x13, x17, x19, LSL #2\n" - "add x11, x13, x19, LSL #2\n" - "add x9, x11, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "add x23, x25, x19, LSL #2\n" - "135:" // Height 7: Column loop - "cbz x8, 136f\n" - "ldr q24, [x8, #0x0]\n" + "122:" // Height 6: No activation + "cmp x13, #0x4\n" + "bge 125f\n" + "tbz x13, #1, 123f\n" + "str d24, [x10], #0x8\n" + "str d25, [x26], #0x8\n" + "str d26, [x25], #0x8\n" + "str d27, [x24], #0x8\n" + "str d28, [x23], #0x8\n" + "str d29, [x22], #0x8\n" + "tbz x13, #0, 124f\n" + "st1 { v24.s }[2], [x10]\n" + "st1 { v25.s }[2], [x26]\n" + "st1 { v26.s }[2], [x25]\n" + "st1 { v27.s }[2], [x24]\n" + "st1 { v28.s }[2], [x23]\n" + "st1 { v29.s }[2], [x22]\n" + "b 124f\n" + "123:" // Height 6: Partial direct writeback: partial_1_0 + "str s24, [x10, #0x0]\n" + "str s25, [x26, #0x0]\n" + "str s26, [x25, #0x0]\n" + "str s27, [x24, #0x0]\n" + "str s28, [x23, #0x0]\n" + "str s29, [x22, #0x0]\n" + "124:" // Height 6: Partial direct writeback: Done + "b 126f\n" + "125:" // Height 6: Full writeback + "str q24, [x10, #0x0]\n" + "add x10, x10, #0x10\n" + "str q25, [x26, #0x0]\n" + "str q26, [x25, #0x0]\n" + "str q27, [x24, #0x0]\n" + "str q28, [x23, #0x0]\n" + "str q29, [x22, #0x0]\n" + "126:" // Height 6: Writeback done + "subs x13, x13, #0x4\n" + "bgt 107b\n" + "b 170f\n" + "127:" // Height 7 + "ldr x13, [%x[args_ptr], %[offsetof_N]]\n" + "mov x11, 
%x[bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x10, %x[output_ptr]\n" + "128:" // Height 7: Column loop + "cbz x11, 129f\n" + "ldr q24, [x11, #0x0]\n" "mov v25.16b, v24.16b\n" - "add x8, x8, #0x10\n" + "add x11, x11, #0x10\n" "mov v26.16b, v24.16b\n" "mov v27.16b, v24.16b\n" "mov v28.16b, v24.16b\n" "mov v29.16b, v24.16b\n" "mov v30.16b, v24.16b\n" - "b 141f\n" - "136:" // Height 7: no bias - "tbz %x[flags], #0, 140f\n" - "cmp x6, #0x4\n" - "bge 139f\n" - "tbz x6, #1, 137f\n" - "ldr d24, [x17], #0x8\n" - "ldr d25, [x13], #0x8\n" - "ldr d26, [x11], #0x8\n" - "ldr d27, [x9], #0x8\n" - "ldr d28, [x27], #0x8\n" - "ldr d29, [x25], #0x8\n" - "ldr d30, [x23], #0x8\n" + "b 134f\n" + "129:" // Height 7: no bias + "tbz %x[flags], #0, 133f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x13, #0x4\n" + "add x26, x10, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "bge 132f\n" + "tbz x13, #1, 130f\n" + "ldr d24, [x10], #0x8\n" + "ldr d25, [x26], #0x8\n" "mov x19, #0x8\n" - "tbz x6, #0, 138f\n" - "ld1 { v24.s }[2], [x17]\n" - "ld1 { v25.s }[2], [x13]\n" - "ld1 { v26.s }[2], [x11]\n" - "ld1 { v27.s }[2], [x9]\n" - "ld1 { v28.s }[2], [x27]\n" - "ld1 { v29.s }[2], [x25]\n" - "ld1 { v30.s }[2], [x23]\n" - "b 138f\n" - "137:" // Height 7: Partial accumulate: partial_1_0 + "ldr d26, [x25], #0x8\n" + "ldr d27, [x24], #0x8\n" + "ldr d28, [x23], #0x8\n" + "ldr d29, [x22], #0x8\n" + "ldr d30, [x21], #0x8\n" + "tbz x13, #0, 131f\n" + "ld1 { v24.s }[2], [x10]\n" + "ld1 { v25.s }[2], [x26]\n" + "ld1 { v26.s }[2], [x25]\n" + "ld1 { v27.s }[2], [x24]\n" + "ld1 { v28.s }[2], [x23]\n" + "ld1 { v29.s }[2], [x22]\n" + "ld1 { v30.s }[2], [x21]\n" + "b 131f\n" + "130:" // Height 7: Partial accumulate: partial_1_0 + "ldr s24, [x10, #0x0]\n" "mov x19, #0x0\n" - "ldr s24, [x17, #0x0]\n" - "ldr s25, [x13, #0x0]\n" - "ldr s26, [x11, 
#0x0]\n" - "ldr s27, [x9, #0x0]\n" - "ldr s28, [x27, #0x0]\n" - "ldr s29, [x25, #0x0]\n" - "ldr s30, [x23, #0x0]\n" - "138:" // Height 7: Partial accumulate: Done - "sub x17, x17, x19\n" - "sub x13, x13, x19\n" - "sub x11, x11, x19\n" - "sub x9, x9, x19\n" - "sub x27, x27, x19\n" - "sub x25, x25, x19\n" - "sub x23, x23, x19\n" - "b 141f\n" - "139:" // Height 7: full accumulate - "ldr q24, [x17, #0x0]\n" - "ldr q25, [x13, #0x0]\n" - "ldr q26, [x11, #0x0]\n" - "ldr q27, [x9, #0x0]\n" - "ldr q28, [x27, #0x0]\n" - "ldr q29, [x25, #0x0]\n" - "ldr q30, [x23, #0x0]\n" - "b 141f\n" - "140:" // Height 7: no accumulate + "ldr s25, [x26, #0x0]\n" + "ldr s26, [x25, #0x0]\n" + "ldr s27, [x24, #0x0]\n" + "ldr s28, [x23, #0x0]\n" + "ldr s29, [x22, #0x0]\n" + "ldr s30, [x21, #0x0]\n" + "131:" // Height 7: Partial accumulate: Done + "sub x10, x10, x19\n" + "b 134f\n" + "132:" // Height 7: full accumulate + "ldr q24, [x10, #0x0]\n" + "ldr q25, [x26, #0x0]\n" + "ldr q26, [x25, #0x0]\n" + "ldr q27, [x24, #0x0]\n" + "ldr q28, [x23, #0x0]\n" + "ldr q29, [x22, #0x0]\n" + "ldr q30, [x21, #0x0]\n" + "b 134f\n" + "133:" // Height 7: no accumulate "movi v24.16b, #0x0\n" "movi v25.16b, #0x0\n" "movi v26.16b, #0x0\n" @@ -1530,87 +1436,144 @@ void a64_hybrid_fp32_mla_8x4 ( "movi v28.16b, #0x0\n" "movi v29.16b, #0x0\n" "movi v30.16b, #0x0\n" - "141:" // Height 7: setup done - "mov x16, #0x0\n" - "142:" // Height 7: String loop + "134:" // Height 7: setup done + "mov x9, #0x0\n" + "135:" // Height 7: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w15, [x20, x16, LSL #0x2]\n" - "tbz %x[flags], #3, 143f\n" - "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "ldr w28, [x20, x9, LSL #0x2]\n" + "tbz %x[flags], #3, 136f\n" + "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x14, [x20, #0x0]\n" - "ldr x12, [x20, #0x8]\n" - "ldr x10, [x20, #0x10]\n" - "ldr x28, [x20, #0x18]\n" - "ldr x26, [x20, 
#0x20]\n" - "ldr x24, [x20, #0x28]\n" - "ldr x22, [x20, #0x30]\n" - "cbnz x16, 144f\n" + "ldr x27, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x25, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x23, [x20, #0x20]\n" + "ldr x22, [x20, #0x28]\n" + "ldr x21, [x20, #0x30]\n" + "cbnz x9, 137f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x14, x14, x19, LSL #2\n" - "add x12, x12, x19, LSL #2\n" - "add x10, x10, x19, LSL #2\n" - "add x28, x28, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" "add x26, x26, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" "add x22, x22, x19, LSL #2\n" - "b 144f\n" - "143:" // Height 7: setup direct input - "mov x14, %x[input_ptr]\n" - "add x12, x14, x19, LSL #2\n" - "add x10, x12, x19, LSL #2\n" - "add x28, x10, x19, LSL #2\n" - "add x26, x28, x19, LSL #2\n" - "add x24, x26, x19, LSL #2\n" - "add x22, x24, x19, LSL #2\n" - "144:" // Height 7: input setup done - "cmp x15, #0x4\n" - "blt 147f\n" - "cmp x15, #0x8\n" - "blt 146f\n" - "145:" // Height 7: Multiply loop: Main loop head - "ldr q0, [x14, #0x0]\n" - "ldr q1, [x12, #0x0]\n" - "ldr q2, [x10, #0x0]\n" - "ldr q3, [x28, #0x0]\n" - "ldr q4, [x26, #0x0]\n" - "ldr q5, [x24, #0x0]\n" - "ldr q6, [x22, #0x0]\n" - "ldr q8, [x7, #0x0]\n" + "add x21, x21, x19, LSL #2\n" + "b 137f\n" + "136:" // Height 7: setup direct input + "mov x27, %x[input_ptr]\n" + "add x26, x27, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "137:" // Height 7: input setup done + "cmp x28, #0x4\n" + "blt 140f\n" + "ldr q0, [x27, #0x0]\n" + "ldr q1, [x26, #0x0]\n" + "cmp x28, #0x8\n" + "ldr q2, [x25, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x23, #0x0]\n" + "ldr q5, [x22, #0x0]\n" + "ldr q6, [x21, #0x0]\n" + "ldr q8, [x12, #0x0]\n" + "blt 139f\n" + "138:" // Height 7: Multiply loop: Main loop head "fmla 
v24.4s, v8.4s, v0.s[0]\n" - "ldr q9, [x7, #0x10]\n" + "ldr q9, [x12, #0x10]\n" + "add x27, x27, #0x10\n" "fmla v25.4s, v8.4s, v1.s[0]\n" - "ldr q10, [x7, #0x20]\n" + "ldr q10, [x12, #0x20]\n" + "add x26, x26, #0x10\n" "fmla v26.4s, v8.4s, v2.s[0]\n" - "ldr q11, [x7, #0x30]\n" + "ldr q11, [x12, #0x30]\n" + "add x25, x25, #0x10\n" "fmla v27.4s, v8.4s, v3.s[0]\n" - "add x14, x14, #0x10\n" - "prfm pldl1keep, [x14, #0x80]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "add x24, x24, #0x10\n" "fmla v28.4s, v8.4s, v4.s[0]\n" - "add x12, x12, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x23, x23, #0x10\n" "fmla v29.4s, v8.4s, v5.s[0]\n" - "prfm pldl1keep, [x12, #0x80]\n" - "add x10, x10, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x22, x22, #0x10\n" "fmla v30.4s, v8.4s, v6.s[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x21, x21, #0x10\n" "fmla v24.4s, v9.4s, v0.s[1]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x23, #0x80]\n" + "sub x28, x28, #0x4\n" "fmla v25.4s, v9.4s, v1.s[1]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "cmp x28, #0x8\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "add x12, x12, #0x40\n" + "fmla v27.4s, v9.4s, v3.s[1]\n" + "ldr q8, [x12, #0x0]\n" + "fmla v28.4s, v9.4s, v4.s[1]\n" + "fmla v29.4s, v9.4s, v5.s[1]\n" + "fmla v30.4s, v9.4s, v6.s[1]\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "fmla v27.4s, v10.4s, v3.s[2]\n" + "fmla v28.4s, v10.4s, v4.s[2]\n" + "fmla v29.4s, v10.4s, v5.s[2]\n" + "fmla v30.4s, v10.4s, v6.s[2]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "ldr q0, [x27, #0x0]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "ldr q1, [x26, #0x0]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "ldr q2, [x25, #0x0]\n" + "fmla v27.4s, v11.4s, v3.s[3]\n" + "ldr q3, [x24, #0x0]\n" + "fmla v28.4s, v11.4s, v4.s[3]\n" + "ldr q4, [x23, #0x0]\n" + "fmla v29.4s, v11.4s, v5.s[3]\n" + 
"ldr q5, [x22, #0x0]\n" + "fmla v30.4s, v11.4s, v6.s[3]\n" + "ldr q6, [x21, #0x0]\n" + "bge 138b\n" + "139:" // Height 7: Multiply loop: Single iteration only + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x12, #0x10]\n" + "sub x28, x28, #0x4\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr q10, [x12, #0x20]\n" + "add x27, x27, #0x10\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr q11, [x12, #0x30]\n" + "add x26, x26, #0x10\n" + "fmla v27.4s, v8.4s, v3.s[0]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "add x25, x25, #0x10\n" + "fmla v28.4s, v8.4s, v4.s[0]\n" "prfm pldl1keep, [x26, #0x80]\n" "add x24, x24, #0x10\n" - "fmla v26.4s, v9.4s, v2.s[1]\n" + "fmla v29.4s, v8.4s, v5.s[0]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x23, x23, #0x10\n" + "fmla v30.4s, v8.4s, v6.s[0]\n" "prfm pldl1keep, [x24, #0x80]\n" "add x22, x22, #0x10\n" - "fmla v27.4s, v9.4s, v3.s[1]\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "add x21, x21, #0x10\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" "prfm pldl1keep, [x22, #0x80]\n" - "sub x15, x15, #0x4\n" + "add x12, x12, #0x40\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "fmla v27.4s, v9.4s, v3.s[1]\n" "fmla v28.4s, v9.4s, v4.s[1]\n" - "cmp x15, #0x8\n" "fmla v29.4s, v9.4s, v5.s[1]\n" - "add x7, x7, #0x40\n" "fmla v30.4s, v9.4s, v6.s[1]\n" "fmla v24.4s, v10.4s, v0.s[2]\n" "fmla v25.4s, v10.4s, v1.s[2]\n" @@ -1626,97 +1589,47 @@ void a64_hybrid_fp32_mla_8x4 ( "fmla v28.4s, v11.4s, v4.s[3]\n" "fmla v29.4s, v11.4s, v5.s[3]\n" "fmla v30.4s, v11.4s, v6.s[3]\n" - "bge 145b\n" - "146:" // Height 7: Multiply loop: Single iteration only - "sub x15, x15, #0x4\n" - "ldr q0, [x14, #0x0]\n" - "ldr q1, [x12, #0x0]\n" - "ldr q2, [x10, #0x0]\n" - "ldr q3, [x28, #0x0]\n" - "ldr q4, [x26, #0x0]\n" - "ldr q5, [x24, #0x0]\n" - "ldr q6, [x22, #0x0]\n" - "ldr q12, [x7, #0x0]\n" + "140:" // Height 7: Multiply loop: Main loop skip + "cbz x28, 142f\n" + "141:" // Height 7: Multiply loop: Odd block loop + "ldr s0, [x27], #0x4\n" + 
"sub x28, x28, #0x1\n" + "ldr s1, [x26], #0x4\n" + "ldr s2, [x25], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x23], #0x4\n" + "ldr s5, [x22], #0x4\n" + "ldr s6, [x21], #0x4\n" + "ldr q12, [x12, #0x0]\n" "fmla v24.4s, v12.4s, v0.s[0]\n" - "ldr q13, [x7, #0x10]\n" + "add x12, x12, #0x10\n" "fmla v25.4s, v12.4s, v1.s[0]\n" - "ldr q14, [x7, #0x20]\n" "fmla v26.4s, v12.4s, v2.s[0]\n" - "ldr q15, [x7, #0x30]\n" "fmla v27.4s, v12.4s, v3.s[0]\n" - "add x14, x14, #0x10\n" - "prfm pldl1keep, [x14, #0x80]\n" "fmla v28.4s, v12.4s, v4.s[0]\n" - "add x12, x12, #0x10\n" "fmla v29.4s, v12.4s, v5.s[0]\n" - "prfm pldl1keep, [x12, #0x80]\n" - "add x10, x10, #0x10\n" "fmla v30.4s, v12.4s, v6.s[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x28, x28, #0x10\n" - "fmla v24.4s, v13.4s, v0.s[1]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" - "fmla v25.4s, v13.4s, v1.s[1]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "add x24, x24, #0x10\n" - "fmla v26.4s, v13.4s, v2.s[1]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "add x22, x22, #0x10\n" - "fmla v27.4s, v13.4s, v3.s[1]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "add x7, x7, #0x40\n" - "fmla v28.4s, v13.4s, v4.s[1]\n" - "fmla v29.4s, v13.4s, v5.s[1]\n" - "fmla v30.4s, v13.4s, v6.s[1]\n" - "fmla v24.4s, v14.4s, v0.s[2]\n" - "fmla v25.4s, v14.4s, v1.s[2]\n" - "fmla v26.4s, v14.4s, v2.s[2]\n" - "fmla v27.4s, v14.4s, v3.s[2]\n" - "fmla v28.4s, v14.4s, v4.s[2]\n" - "fmla v29.4s, v14.4s, v5.s[2]\n" - "fmla v30.4s, v14.4s, v6.s[2]\n" - "fmla v24.4s, v15.4s, v0.s[3]\n" - "fmla v25.4s, v15.4s, v1.s[3]\n" - "fmla v26.4s, v15.4s, v2.s[3]\n" - "fmla v27.4s, v15.4s, v3.s[3]\n" - "fmla v28.4s, v15.4s, v4.s[3]\n" - "fmla v29.4s, v15.4s, v5.s[3]\n" - "fmla v30.4s, v15.4s, v6.s[3]\n" - "147:" // Height 7: Multiply loop: Main loop skip - "cbz x15, 149f\n" - "148:" // Height 7: Multiply loop: Odd block loop - "ldr s0, [x14], #0x4\n" - "ldr s1, [x12], #0x4\n" - "ldr s2, [x10], #0x4\n" - "ldr s3, [x28], #0x4\n" - "ldr s4, [x26], #0x4\n" - "ldr s5, 
[x24], #0x4\n" - "ldr s6, [x22], #0x4\n" - "ldr q16, [x7, #0x0]\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "sub x15, x15, #0x1\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "add x7, x7, #0x10\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "fmla v29.4s, v16.4s, v5.s[0]\n" - "fmla v30.4s, v16.4s, v6.s[0]\n" - "cbnz x15, 148b\n" - "149:" // Height 7: Multiply loop: No odd multiplies + "cbnz x28, 141b\n" + "142:" // Height 7: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x16, x16, #0x1\n" - "cmp x16, x19\n" - "bne 142b\n" - "prfm pstl1keep, [x17, #0x0]\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x11, #0x0]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "prfm pstl1keep, [x27, #0x0]\n" + "add x9, x9, #0x1\n" + "cmp x9, x19\n" + "bne 135b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x10, #0x0]\n" + "add x26, x10, x19, LSL #2\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x25, x26, x19, LSL #2\n" "prfm pstl1keep, [x25, #0x0]\n" + "add x24, x25, x19, LSL #2\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #2\n" "prfm pstl1keep, [x23, #0x0]\n" - "tbz %x[flags], #1, 150f\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbz %x[flags], #1, 143f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1r { v17.4s }, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -1735,156 +1648,125 @@ void a64_hybrid_fp32_mla_8x4 ( "fmax v28.4s, v28.4s, v17.4s\n" "fmax v29.4s, v29.4s, v17.4s\n" "fmax v30.4s, v30.4s, v17.4s\n" - "150:" // Height 7: No activation - "cmp x6, #0x4\n" - "bge 153f\n" - "tbz x6, #1, 151f\n" - "str d24, [x17], #0x8\n" - "str d25, [x13], #0x8\n" - "str d26, [x11], #0x8\n" - "str d27, [x9], #0x8\n" - "str d28, [x27], #0x8\n" - "str d29, [x25], #0x8\n" - "str d30, [x23], #0x8\n" - "tbz x6, #0, 152f\n" - "st1 { v24.s }[2], [x17]\n" - "st1 { 
v25.s }[2], [x13]\n" - "st1 { v26.s }[2], [x11]\n" - "st1 { v27.s }[2], [x9]\n" - "st1 { v28.s }[2], [x27]\n" - "st1 { v29.s }[2], [x25]\n" - "st1 { v30.s }[2], [x23]\n" - "b 152f\n" - "151:" // Height 7: Partial direct writeback: partial_1_0 - "str s24, [x17, #0x0]\n" - "str s25, [x13, #0x0]\n" - "str s26, [x11, #0x0]\n" - "str s27, [x9, #0x0]\n" - "str s28, [x27, #0x0]\n" - "str s29, [x25, #0x0]\n" - "str s30, [x23, #0x0]\n" - "152:" // Height 7: Partial direct writeback: Done - "b 154f\n" - "153:" // Height 7: Full writeback - "str q24, [x17, #0x0]\n" - "str q25, [x13, #0x0]\n" - "str q26, [x11, #0x0]\n" - "str q27, [x9, #0x0]\n" - "str q28, [x27, #0x0]\n" - "str q29, [x25, #0x0]\n" - "str q30, [x23, #0x0]\n" - "add x17, x17, #0x10\n" - "add x13, x13, #0x10\n" - "add x11, x11, #0x10\n" - "add x9, x9, #0x10\n" - "add x27, x27, #0x10\n" - "add x25, x25, #0x10\n" - "add x23, x23, #0x10\n" - "154:" // Height 7: Writeback done - "subs x6, x6, #0x4\n" - "bgt 135b\n" - "b 178f\n" - "155:" // Height 8 - "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" - "mov x8, %x[bias]\n" - "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 156f\n" - "ldr x17, [%x[output_ptr], #0x0]\n" - "add x17, x17, x19, LSL #2\n" - "ldr x13, [%x[output_ptr], #0x8]\n" - "ldr x11, [%x[output_ptr], #0x10]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x18]\n" - "ldr x27, [%x[output_ptr], #0x20]\n" - "add x11, x11, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x28]\n" - "add x9, x9, x19, LSL #2\n" - "ldr x23, [%x[output_ptr], #0x30]\n" - "ldr x21, [%x[output_ptr], #0x38]\n" - "add x27, x27, x19, LSL #2\n" - "add x25, x25, x19, LSL #2\n" - "add %x[output_ptr], %x[output_ptr], #0x40\n" - "add x23, x23, x19, LSL #2\n" - "add x21, x21, x19, LSL #2\n" - "b 157f\n" - "156:" // Height 8: setup direct output - "mov x17, %x[output_ptr]\n" - "add x13, x17, x19, LSL #2\n" - "add x11, x13, x19, LSL #2\n" - "add x9, x11, x19, LSL 
#2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "add x23, x25, x19, LSL #2\n" - "add x21, x23, x19, LSL #2\n" - "add %x[output_ptr], x21, x19, LSL #2\n" - "157:" // Height 8: Column loop - "cbz x8, 158f\n" - "ldr q24, [x8, #0x0]\n" + "143:" // Height 7: No activation + "cmp x13, #0x4\n" + "bge 146f\n" + "tbz x13, #1, 144f\n" + "str d24, [x10], #0x8\n" + "str d25, [x26], #0x8\n" + "str d26, [x25], #0x8\n" + "str d27, [x24], #0x8\n" + "str d28, [x23], #0x8\n" + "str d29, [x22], #0x8\n" + "str d30, [x21], #0x8\n" + "tbz x13, #0, 145f\n" + "st1 { v24.s }[2], [x10]\n" + "st1 { v25.s }[2], [x26]\n" + "st1 { v26.s }[2], [x25]\n" + "st1 { v27.s }[2], [x24]\n" + "st1 { v28.s }[2], [x23]\n" + "st1 { v29.s }[2], [x22]\n" + "st1 { v30.s }[2], [x21]\n" + "b 145f\n" + "144:" // Height 7: Partial direct writeback: partial_1_0 + "str s24, [x10, #0x0]\n" + "str s25, [x26, #0x0]\n" + "str s26, [x25, #0x0]\n" + "str s27, [x24, #0x0]\n" + "str s28, [x23, #0x0]\n" + "str s29, [x22, #0x0]\n" + "str s30, [x21, #0x0]\n" + "145:" // Height 7: Partial direct writeback: Done + "b 147f\n" + "146:" // Height 7: Full writeback + "str q24, [x10, #0x0]\n" + "add x10, x10, #0x10\n" + "str q25, [x26, #0x0]\n" + "str q26, [x25, #0x0]\n" + "str q27, [x24, #0x0]\n" + "str q28, [x23, #0x0]\n" + "str q29, [x22, #0x0]\n" + "str q30, [x21, #0x0]\n" + "147:" // Height 7: Writeback done + "subs x13, x13, #0x4\n" + "bgt 128b\n" + "b 170f\n" + "148:" // Height 8 + "ldr x13, [%x[args_ptr], %[offsetof_N]]\n" + "mov x11, %x[bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x10, %x[output_ptr]\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x19, #0x20\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "149:" // Height 8: Column loop + "cbz x11, 150f\n" + "ldr q24, [x11, #0x0]\n" "mov v25.16b, v24.16b\n" - "add x8, x8, #0x10\n" + "add x11, x11, #0x10\n" "mov v26.16b, v24.16b\n" "mov v27.16b, v24.16b\n" "mov v28.16b, v24.16b\n" "mov v29.16b, v24.16b\n" 
"mov v30.16b, v24.16b\n" "mov v31.16b, v24.16b\n" - "b 163f\n" - "158:" // Height 8: no bias - "tbz %x[flags], #0, 162f\n" - "cmp x6, #0x4\n" - "bge 161f\n" - "tbz x6, #1, 159f\n" - "ldr d24, [x17], #0x8\n" - "ldr d25, [x13], #0x8\n" - "ldr d26, [x11], #0x8\n" - "ldr d27, [x9], #0x8\n" - "ldr d28, [x27], #0x8\n" - "ldr d29, [x25], #0x8\n" - "ldr d30, [x23], #0x8\n" - "ldr d31, [x21], #0x8\n" + "b 155f\n" + "150:" // Height 8: no bias + "tbz %x[flags], #0, 154f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x13, #0x4\n" + "add x26, x10, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "bge 153f\n" + "tbz x13, #1, 151f\n" + "ldr d24, [x10], #0x8\n" + "ldr d25, [x26], #0x8\n" "mov x19, #0x8\n" - "tbz x6, #0, 160f\n" - "ld1 { v24.s }[2], [x17]\n" - "ld1 { v25.s }[2], [x13]\n" - "ld1 { v26.s }[2], [x11]\n" - "ld1 { v27.s }[2], [x9]\n" - "ld1 { v28.s }[2], [x27]\n" - "ld1 { v29.s }[2], [x25]\n" - "ld1 { v30.s }[2], [x23]\n" - "ld1 { v31.s }[2], [x21]\n" - "b 160f\n" - "159:" // Height 8: Partial accumulate: partial_1_0 + "ldr d26, [x25], #0x8\n" + "ldr d27, [x24], #0x8\n" + "ldr d28, [x23], #0x8\n" + "ldr d29, [x22], #0x8\n" + "ldr d30, [x21], #0x8\n" + "ldr d31, [x20], #0x8\n" + "tbz x13, #0, 152f\n" + "ld1 { v24.s }[2], [x10]\n" + "ld1 { v25.s }[2], [x26]\n" + "ld1 { v26.s }[2], [x25]\n" + "ld1 { v27.s }[2], [x24]\n" + "ld1 { v28.s }[2], [x23]\n" + "ld1 { v29.s }[2], [x22]\n" + "ld1 { v30.s }[2], [x21]\n" + "ld1 { v31.s }[2], [x20]\n" + "b 152f\n" + "151:" // Height 8: Partial accumulate: partial_1_0 + "ldr s24, [x10, #0x0]\n" "mov x19, #0x0\n" - "ldr s24, [x17, #0x0]\n" - "ldr s25, [x13, #0x0]\n" - "ldr s26, [x11, #0x0]\n" - "ldr s27, [x9, #0x0]\n" - "ldr s28, [x27, #0x0]\n" - "ldr s29, [x25, #0x0]\n" - "ldr s30, [x23, #0x0]\n" - "ldr s31, [x21, #0x0]\n" - "160:" // Height 8: Partial 
accumulate: Done - "sub x17, x17, x19\n" - "sub x13, x13, x19\n" - "sub x11, x11, x19\n" - "sub x9, x9, x19\n" - "sub x27, x27, x19\n" - "sub x25, x25, x19\n" - "sub x23, x23, x19\n" - "sub x21, x21, x19\n" - "b 163f\n" - "161:" // Height 8: full accumulate - "ldr q24, [x17, #0x0]\n" - "ldr q25, [x13, #0x0]\n" - "ldr q26, [x11, #0x0]\n" - "ldr q27, [x9, #0x0]\n" - "ldr q28, [x27, #0x0]\n" - "ldr q29, [x25, #0x0]\n" - "ldr q30, [x23, #0x0]\n" - "ldr q31, [x21, #0x0]\n" - "b 163f\n" - "162:" // Height 8: no accumulate + "ldr s25, [x26, #0x0]\n" + "ldr s26, [x25, #0x0]\n" + "ldr s27, [x24, #0x0]\n" + "ldr s28, [x23, #0x0]\n" + "ldr s29, [x22, #0x0]\n" + "ldr s30, [x21, #0x0]\n" + "ldr s31, [x20, #0x0]\n" + "152:" // Height 8: Partial accumulate: Done + "sub x10, x10, x19\n" + "b 155f\n" + "153:" // Height 8: full accumulate + "ldr q24, [x10, #0x0]\n" + "ldr q25, [x26, #0x0]\n" + "ldr q26, [x25, #0x0]\n" + "ldr q27, [x24, #0x0]\n" + "ldr q28, [x23, #0x0]\n" + "ldr q29, [x22, #0x0]\n" + "ldr q30, [x21, #0x0]\n" + "ldr q31, [x20, #0x0]\n" + "b 155f\n" + "154:" // Height 8: no accumulate "movi v24.16b, #0x0\n" "movi v25.16b, #0x0\n" "movi v26.16b, #0x0\n" @@ -1893,94 +1775,158 @@ void a64_hybrid_fp32_mla_8x4 ( "movi v29.16b, #0x0\n" "movi v30.16b, #0x0\n" "movi v31.16b, #0x0\n" - "163:" // Height 8: setup done - "mov x16, #0x0\n" - "164:" // Height 8: String loop + "155:" // Height 8: setup done + "mov x9, #0x0\n" + "156:" // Height 8: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w15, [x20, x16, LSL #0x2]\n" - "tbz %x[flags], #3, 165f\n" - "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "ldr w28, [x20, x9, LSL #0x2]\n" + "tbz %x[flags], #3, 157f\n" + "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x14, [x20, #0x0]\n" - "ldr x12, [x20, #0x8]\n" - "ldr x10, [x20, #0x10]\n" - "ldr x28, [x20, #0x18]\n" - "ldr x26, [x20, #0x20]\n" - "ldr x24, [x20, #0x28]\n" - "ldr 
x22, [x20, #0x30]\n" + "ldr x27, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x25, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x23, [x20, #0x20]\n" + "ldr x22, [x20, #0x28]\n" + "ldr x21, [x20, #0x30]\n" "ldr x20, [x20, #0x38]\n" - "cbnz x16, 166f\n" + "cbnz x9, 158f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x14, x14, x19, LSL #2\n" - "add x12, x12, x19, LSL #2\n" - "add x10, x10, x19, LSL #2\n" - "add x28, x28, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" "add x26, x26, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" "add x22, x22, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" "add x20, x20, x19, LSL #2\n" - "b 166f\n" - "165:" // Height 8: setup direct input - "mov x14, %x[input_ptr]\n" - "add x12, x14, x19, LSL #2\n" - "add x10, x12, x19, LSL #2\n" - "add x28, x10, x19, LSL #2\n" - "add x26, x28, x19, LSL #2\n" - "add x24, x26, x19, LSL #2\n" - "add x22, x24, x19, LSL #2\n" - "add x20, x22, x19, LSL #2\n" - "166:" // Height 8: input setup done - "cmp x15, #0x4\n" - "blt 169f\n" - "cmp x15, #0x8\n" - "blt 168f\n" - "167:" // Height 8: Multiply loop: Main loop head - "ldr q0, [x14, #0x0]\n" - "ldr q1, [x12, #0x0]\n" - "ldr q2, [x10, #0x0]\n" - "ldr q3, [x28, #0x0]\n" - "ldr q4, [x26, #0x0]\n" - "ldr q5, [x24, #0x0]\n" - "ldr q6, [x22, #0x0]\n" + "b 158f\n" + "157:" // Height 8: setup direct input + "mov x27, %x[input_ptr]\n" + "add x26, x27, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "158:" // Height 8: input setup done + "cmp x28, #0x4\n" + "blt 161f\n" + "ldr q0, [x27, #0x0]\n" + "ldr q1, [x26, #0x0]\n" + "cmp x28, #0x8\n" + "ldr q2, [x25, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x23, #0x0]\n" + "ldr q5, [x22, #0x0]\n" + "ldr q6, [x21, #0x0]\n" "ldr q7, [x20, #0x0]\n" - "ldr q8, [x7, #0x0]\n" 
+ "ldr q8, [x12, #0x0]\n" + "blt 160f\n" + "159:" // Height 8: Multiply loop: Main loop head "fmla v24.4s, v8.4s, v0.s[0]\n" - "ldr q9, [x7, #0x10]\n" + "ldr q9, [x12, #0x10]\n" + "add x27, x27, #0x10\n" "fmla v25.4s, v8.4s, v1.s[0]\n" - "ldr q10, [x7, #0x20]\n" + "ldr q10, [x12, #0x20]\n" + "add x26, x26, #0x10\n" "fmla v26.4s, v8.4s, v2.s[0]\n" - "ldr q11, [x7, #0x30]\n" + "ldr q11, [x12, #0x30]\n" + "add x25, x25, #0x10\n" "fmla v27.4s, v8.4s, v3.s[0]\n" - "add x14, x14, #0x10\n" - "prfm pldl1keep, [x14, #0x80]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "add x24, x24, #0x10\n" "fmla v28.4s, v8.4s, v4.s[0]\n" - "add x12, x12, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x23, x23, #0x10\n" "fmla v29.4s, v8.4s, v5.s[0]\n" - "prfm pldl1keep, [x12, #0x80]\n" - "add x10, x10, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x22, x22, #0x10\n" "fmla v30.4s, v8.4s, v6.s[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x21, x21, #0x10\n" "fmla v31.4s, v8.4s, v7.s[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x23, #0x80]\n" + "add x20, x20, #0x10\n" "fmla v24.4s, v9.4s, v0.s[1]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sub x28, x28, #0x4\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "cmp x28, #0x8\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "add x12, x12, #0x40\n" + "fmla v27.4s, v9.4s, v3.s[1]\n" + "ldr q8, [x12, #0x0]\n" + "fmla v28.4s, v9.4s, v4.s[1]\n" + "fmla v29.4s, v9.4s, v5.s[1]\n" + "fmla v30.4s, v9.4s, v6.s[1]\n" + "fmla v31.4s, v9.4s, v7.s[1]\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "fmla v27.4s, v10.4s, v3.s[2]\n" + "fmla v28.4s, v10.4s, v4.s[2]\n" + "fmla v29.4s, v10.4s, v5.s[2]\n" + "fmla v30.4s, v10.4s, v6.s[2]\n" + "fmla v31.4s, v10.4s, v7.s[2]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "ldr q0, [x27, #0x0]\n" + "fmla v25.4s, 
v11.4s, v1.s[3]\n" + "ldr q1, [x26, #0x0]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "ldr q2, [x25, #0x0]\n" + "fmla v27.4s, v11.4s, v3.s[3]\n" + "ldr q3, [x24, #0x0]\n" + "fmla v28.4s, v11.4s, v4.s[3]\n" + "ldr q4, [x23, #0x0]\n" + "fmla v29.4s, v11.4s, v5.s[3]\n" + "ldr q5, [x22, #0x0]\n" + "fmla v30.4s, v11.4s, v6.s[3]\n" + "ldr q6, [x21, #0x0]\n" + "fmla v31.4s, v11.4s, v7.s[3]\n" + "ldr q7, [x20, #0x0]\n" + "bge 159b\n" + "160:" // Height 8: Multiply loop: Single iteration only + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x12, #0x10]\n" + "sub x28, x28, #0x4\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr q10, [x12, #0x20]\n" + "add x27, x27, #0x10\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr q11, [x12, #0x30]\n" + "add x26, x26, #0x10\n" + "fmla v27.4s, v8.4s, v3.s[0]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "add x25, x25, #0x10\n" + "fmla v28.4s, v8.4s, v4.s[0]\n" "prfm pldl1keep, [x26, #0x80]\n" "add x24, x24, #0x10\n" - "fmla v25.4s, v9.4s, v1.s[1]\n" + "fmla v29.4s, v8.4s, v5.s[0]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x23, x23, #0x10\n" + "fmla v30.4s, v8.4s, v6.s[0]\n" "prfm pldl1keep, [x24, #0x80]\n" "add x22, x22, #0x10\n" - "fmla v26.4s, v9.4s, v2.s[1]\n" + "fmla v31.4s, v8.4s, v7.s[0]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "add x21, x21, #0x10\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" "prfm pldl1keep, [x22, #0x80]\n" "add x20, x20, #0x10\n" - "fmla v27.4s, v9.4s, v3.s[1]\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "add x12, x12, #0x40\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" "prfm pldl1keep, [x20, #0x80]\n" - "sub x15, x15, #0x4\n" + "fmla v27.4s, v9.4s, v3.s[1]\n" "fmla v28.4s, v9.4s, v4.s[1]\n" - "cmp x15, #0x8\n" "fmla v29.4s, v9.4s, v5.s[1]\n" - "add x7, x7, #0x40\n" "fmla v30.4s, v9.4s, v6.s[1]\n" "fmla v31.4s, v9.4s, v7.s[1]\n" "fmla v24.4s, v10.4s, v0.s[2]\n" @@ -1999,107 +1945,51 @@ void a64_hybrid_fp32_mla_8x4 ( "fmla v29.4s, v11.4s, v5.s[3]\n" "fmla v30.4s, v11.4s, v6.s[3]\n" "fmla v31.4s, v11.4s, v7.s[3]\n" - "bge 
167b\n" - "168:" // Height 8: Multiply loop: Single iteration only - "sub x15, x15, #0x4\n" - "ldr q0, [x14, #0x0]\n" - "ldr q1, [x12, #0x0]\n" - "ldr q2, [x10, #0x0]\n" - "ldr q3, [x28, #0x0]\n" - "ldr q4, [x26, #0x0]\n" - "ldr q5, [x24, #0x0]\n" - "ldr q6, [x22, #0x0]\n" - "ldr q7, [x20, #0x0]\n" - "ldr q12, [x7, #0x0]\n" + "161:" // Height 8: Multiply loop: Main loop skip + "cbz x28, 163f\n" + "162:" // Height 8: Multiply loop: Odd block loop + "ldr s0, [x27], #0x4\n" + "sub x28, x28, #0x1\n" + "ldr s1, [x26], #0x4\n" + "ldr s2, [x25], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x23], #0x4\n" + "ldr s5, [x22], #0x4\n" + "ldr s6, [x21], #0x4\n" + "ldr s7, [x20], #0x4\n" + "ldr q12, [x12, #0x0]\n" "fmla v24.4s, v12.4s, v0.s[0]\n" - "ldr q13, [x7, #0x10]\n" + "add x12, x12, #0x10\n" "fmla v25.4s, v12.4s, v1.s[0]\n" - "ldr q14, [x7, #0x20]\n" "fmla v26.4s, v12.4s, v2.s[0]\n" - "ldr q15, [x7, #0x30]\n" "fmla v27.4s, v12.4s, v3.s[0]\n" - "add x14, x14, #0x10\n" - "prfm pldl1keep, [x14, #0x80]\n" "fmla v28.4s, v12.4s, v4.s[0]\n" - "add x12, x12, #0x10\n" "fmla v29.4s, v12.4s, v5.s[0]\n" - "prfm pldl1keep, [x12, #0x80]\n" - "add x10, x10, #0x10\n" "fmla v30.4s, v12.4s, v6.s[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x28, x28, #0x10\n" "fmla v31.4s, v12.4s, v7.s[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" - "fmla v24.4s, v13.4s, v0.s[1]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "add x24, x24, #0x10\n" - "fmla v25.4s, v13.4s, v1.s[1]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "add x22, x22, #0x10\n" - "fmla v26.4s, v13.4s, v2.s[1]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "add x20, x20, #0x10\n" - "fmla v27.4s, v13.4s, v3.s[1]\n" - "prfm pldl1keep, [x20, #0x80]\n" - "add x7, x7, #0x40\n" - "fmla v28.4s, v13.4s, v4.s[1]\n" - "fmla v29.4s, v13.4s, v5.s[1]\n" - "fmla v30.4s, v13.4s, v6.s[1]\n" - "fmla v31.4s, v13.4s, v7.s[1]\n" - "fmla v24.4s, v14.4s, v0.s[2]\n" - "fmla v25.4s, v14.4s, v1.s[2]\n" - "fmla v26.4s, v14.4s, v2.s[2]\n" - "fmla v27.4s, 
v14.4s, v3.s[2]\n" - "fmla v28.4s, v14.4s, v4.s[2]\n" - "fmla v29.4s, v14.4s, v5.s[2]\n" - "fmla v30.4s, v14.4s, v6.s[2]\n" - "fmla v31.4s, v14.4s, v7.s[2]\n" - "fmla v24.4s, v15.4s, v0.s[3]\n" - "fmla v25.4s, v15.4s, v1.s[3]\n" - "fmla v26.4s, v15.4s, v2.s[3]\n" - "fmla v27.4s, v15.4s, v3.s[3]\n" - "fmla v28.4s, v15.4s, v4.s[3]\n" - "fmla v29.4s, v15.4s, v5.s[3]\n" - "fmla v30.4s, v15.4s, v6.s[3]\n" - "fmla v31.4s, v15.4s, v7.s[3]\n" - "169:" // Height 8: Multiply loop: Main loop skip - "cbz x15, 171f\n" - "170:" // Height 8: Multiply loop: Odd block loop - "ldr s0, [x14], #0x4\n" - "ldr s1, [x12], #0x4\n" - "ldr s2, [x10], #0x4\n" - "ldr s3, [x28], #0x4\n" - "ldr s4, [x26], #0x4\n" - "ldr s5, [x24], #0x4\n" - "ldr s6, [x22], #0x4\n" - "ldr s7, [x20], #0x4\n" - "ldr q16, [x7, #0x0]\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "sub x15, x15, #0x1\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "add x7, x7, #0x10\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "fmla v29.4s, v16.4s, v5.s[0]\n" - "fmla v30.4s, v16.4s, v6.s[0]\n" - "fmla v31.4s, v16.4s, v7.s[0]\n" - "cbnz x15, 170b\n" - "171:" // Height 8: Multiply loop: No odd multiplies + "cbnz x28, 162b\n" + "163:" // Height 8: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x16, x16, #0x1\n" - "cmp x16, x19\n" - "bne 164b\n" - "prfm pstl1keep, [x17, #0x0]\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x11, #0x0]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "prfm pstl1keep, [x27, #0x0]\n" + "add x9, x9, #0x1\n" + "cmp x9, x19\n" + "bne 156b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x10, #0x0]\n" + "add x26, x10, x19, LSL #2\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x25, x26, x19, LSL #2\n" "prfm pstl1keep, [x25, #0x0]\n" + "add x24, x25, x19, LSL #2\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x19, LSL #2\n" "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + 
"prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" "prfm pstl1keep, [x21, #0x0]\n" - "tbz %x[flags], #1, 172f\n" + "add x20, x21, x19, LSL #2\n" + "prfm pstl1keep, [x20, #0x0]\n" + "tbz %x[flags], #1, 164f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1r { v17.4s }, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -2120,75 +2010,68 @@ void a64_hybrid_fp32_mla_8x4 ( "fmax v30.4s, v30.4s, v17.4s\n" "fmin v31.4s, v31.4s, v16.4s\n" "fmax v31.4s, v31.4s, v17.4s\n" - "172:" // Height 8: No activation - "cmp x6, #0x4\n" - "bge 175f\n" - "tbz x6, #1, 173f\n" - "str d24, [x17], #0x8\n" - "str d25, [x13], #0x8\n" - "str d26, [x11], #0x8\n" - "str d27, [x9], #0x8\n" - "str d28, [x27], #0x8\n" - "str d29, [x25], #0x8\n" - "str d30, [x23], #0x8\n" - "str d31, [x21], #0x8\n" - "tbz x6, #0, 174f\n" - "st1 { v24.s }[2], [x17]\n" - "st1 { v25.s }[2], [x13]\n" - "st1 { v26.s }[2], [x11]\n" - "st1 { v27.s }[2], [x9]\n" - "st1 { v28.s }[2], [x27]\n" - "st1 { v29.s }[2], [x25]\n" - "st1 { v30.s }[2], [x23]\n" - "st1 { v31.s }[2], [x21]\n" - "b 174f\n" - "173:" // Height 8: Partial direct writeback: partial_1_0 - "str s24, [x17, #0x0]\n" - "str s25, [x13, #0x0]\n" - "str s26, [x11, #0x0]\n" - "str s27, [x9, #0x0]\n" - "str s28, [x27, #0x0]\n" - "str s29, [x25, #0x0]\n" - "str s30, [x23, #0x0]\n" - "str s31, [x21, #0x0]\n" - "174:" // Height 8: Partial direct writeback: Done - "b 176f\n" - "175:" // Height 8: Full writeback - "str q24, [x17, #0x0]\n" - "str q25, [x13, #0x0]\n" - "str q26, [x11, #0x0]\n" - "str q27, [x9, #0x0]\n" - "str q28, [x27, #0x0]\n" - "str q29, [x25, #0x0]\n" - "str q30, [x23, #0x0]\n" - "str q31, [x21, #0x0]\n" - "add x17, x17, #0x10\n" - "add x13, x13, #0x10\n" - "add x11, x11, #0x10\n" - "add x9, x9, #0x10\n" - "add x27, x27, #0x10\n" - "add x25, x25, #0x10\n" - "add x23, x23, #0x10\n" - "add x21, x21, #0x10\n" - "176:" // Height 8: Writeback done - "subs x6, x6, #0x4\n" - "bgt 157b\n" + "164:" // Height 8: No activation + "cmp x13, #0x4\n" + 
"bge 167f\n" + "tbz x13, #1, 165f\n" + "str d24, [x10], #0x8\n" + "str d25, [x26], #0x8\n" + "str d26, [x25], #0x8\n" + "str d27, [x24], #0x8\n" + "str d28, [x23], #0x8\n" + "str d29, [x22], #0x8\n" + "str d30, [x21], #0x8\n" + "str d31, [x20], #0x8\n" + "tbz x13, #0, 166f\n" + "st1 { v24.s }[2], [x10]\n" + "st1 { v25.s }[2], [x26]\n" + "st1 { v26.s }[2], [x25]\n" + "st1 { v27.s }[2], [x24]\n" + "st1 { v28.s }[2], [x23]\n" + "st1 { v29.s }[2], [x22]\n" + "st1 { v30.s }[2], [x21]\n" + "st1 { v31.s }[2], [x20]\n" + "b 166f\n" + "165:" // Height 8: Partial direct writeback: partial_1_0 + "str s24, [x10, #0x0]\n" + "str s25, [x26, #0x0]\n" + "str s26, [x25, #0x0]\n" + "str s27, [x24, #0x0]\n" + "str s28, [x23, #0x0]\n" + "str s29, [x22, #0x0]\n" + "str s30, [x21, #0x0]\n" + "str s31, [x20, #0x0]\n" + "166:" // Height 8: Partial direct writeback: Done + "b 168f\n" + "167:" // Height 8: Full writeback + "str q24, [x10, #0x0]\n" + "add x10, x10, #0x10\n" + "str q25, [x26, #0x0]\n" + "str q26, [x25, #0x0]\n" + "str q27, [x24, #0x0]\n" + "str q28, [x23, #0x0]\n" + "str q29, [x22, #0x0]\n" + "str q30, [x21, #0x0]\n" + "str q31, [x20, #0x0]\n" + "168:" // Height 8: Writeback done + "subs x13, x13, #0x4\n" + "bgt 149b\n" "subs %x[M], %x[M], #0x8\n" - "beq 178f\n" + "beq 170f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 177f\n" + "tbz %x[flags], #3, 169f\n" "add x20, x20, #0x8\n" "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "177:" // Update direct input + "169:" // Update direct input "mov x19, #0x20\n" "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" "b 1b\n" - "178:" // Exit + "170:" // Exit : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), 
[offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp index 5f14072f0a..caef6396be 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp @@ -37,9 +37,9 @@ namespace arm_gemm { - // Actual kernel implementations void a64_hybrid_s8qa_dot_4x16( ARGLIST ); +void a64_hybrid_s8qa_dot_4x16_a55( ARGLIST ); class cls_a64_hybrid_s8qa_dot_4x16 { @@ -72,7 +72,8 @@ public: StdTransformsFixed transforms = {}; - static PerformanceParameters get_performance_parameters(const CPUInfo *ci) { + static PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { switch (ci->get_cpu_model()) { case CPUModel::A55r1: return { 7.5301 }; @@ -83,9 +84,15 @@ public: // Default to the generic kernel kern_type kernel=a64_hybrid_s8qa_dot_4x16; - - cls_a64_hybrid_s8qa_dot_4x16(const CPUInfo *) + cls_a64_hybrid_s8qa_dot_4x16(const CPUInfo *ci) { + 
switch(ci->get_cpu_model()) { + default: + break; + case CPUModel::A55r1: + kernel=a64_hybrid_s8qa_dot_4x16_a55; + break; + } } }; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp new file mode 100644 index 0000000000..fb85506ee8 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp @@ -0,0 +1,2148 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include <cassert> +#include <cstdint> + +namespace arm_gemm { + +void a64_hybrid_s8qa_dot_4x16_a55 ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg A_arg, + size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg output_arg, + const Requantize32 *qp, const int32_t *col_bias, unsigned int +) +{ + struct KernelArgs { + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const int8_t *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + if (qp->c_offset > qp->minval) { + flags |= 0x20; + } + __asm__ __volatile__( + + "1:" // Row loop + "cmp %x[M], #0x4\n" + "bge 91f\n" + "cmp %x[M], #0x2\n" + "bgt 61f\n" + "beq 31f\n" + "movi v11.4s, #0x0\n" + "movi v15.16b, #0x1\n" + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x9, %x[col_bias]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov x28, %x[output_ptr]\n" + "2:" // Height 1: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "3:" // Height 1: setup done + "mov x27, #0x0\n" + "4:" // Height 1: String loop 
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 5f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "cbnz x27, 6f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "b 6f\n" + "5:" // Height 1: setup direct input + "mov x25, %x[input_ptr]\n" + "6:" // Height 1: input setup done + "cmp x26, #0x10\n" + "blt 11f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q4, [x10, #0x0]\n" + "cmp x26, #0x20\n" + "blt 9f\n" + "7:" // Height 1: Multiply loop: Main loop head + ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" + "ldr d5, [x10, #0x10]\n" + "ldr x24, [x10, #0x18]\n" + "add x25, x25, #0x10\n" + "ldr d6, [x10, #0x20]\n" + "ldr x23, [x10, #0x28]\n" + "mov v5.d[1], x24\n" + "ldr d7, [x10, #0x30]\n" + "ldr x19, [x10, #0x38]\n" + "mov v6.d[1], x23\n" + ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" + "ldr d8, [x10, #0x40]\n" + "mov v7.d[1], x19\n" + ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" + "ldr x23, [x10, #0x48]\n" + ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" + "ldr d9, [x10, #0x50]\n" + "ldr x19, [x10, #0x58]\n" + "mov v8.d[1], x23\n" + "ldr d10, [x10, #0x60]\n" + "ldr x23, [x10, #0x68]\n" + "mov v9.d[1], x19\n" + ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" + "ldr d4, [x10, #0x70]\n" + "mov v10.d[1], x23\n" + ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" + "ldr x19, [x10, #0x78]\n" + ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" + "ldr d5, [x10, #0x80]\n" + "ldr x24, [x10, #0x88]\n" + "mov v4.d[1], x19\n" + "ldr d6, [x10, #0x90]\n" + "ldr x23, [x10, #0x98]\n" + "mov v5.d[1], x24\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + "ldr d7, [x10, #0xa0]\n" + "mov v6.d[1], x23\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + "ldr x19, [x10, #0xa8]\n" + ".inst 0x4f80e8d1 // sdot v17.4s, 
v6.16b, v0.4b[2]\n" + "ldr d8, [x10, #0xb0]\n" + "ldr x23, [x10, #0xb8]\n" + "mov v7.d[1], x19\n" + "ldr d9, [x10, #0xc0]\n" + "ldr x19, [x10, #0xc8]\n" + "mov v8.d[1], x23\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + "ldr d10, [x10, #0xd0]\n" + "mov v9.d[1], x19\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + "ldr x23, [x10, #0xd8]\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + "ldr d4, [x10, #0xe0]\n" + "ldr x19, [x10, #0xe8]\n" + "mov v10.d[1], x23\n" + "ldr d5, [x10, #0xf0]\n" + "ldr x24, [x10, #0xf8]\n" + "add x10, x10, #0x100\n" + "mov v4.d[1], x19\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + "mov v5.d[1], x24\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + "tbnz %x[flags], #31, 8f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + "8:" // Height 1: Multiply loop: unique 1: skip row sum + "prfm pldl1keep, [x25, #0x80]\n" + "sub x26, x26, #0x10\n" + "ldr q0, [x25, #0x0]\n" + "cmp x26, #0x20\n" + "ldr q4, [x10, #0x0]\n" + "bge 7b\n" + "9:" // Height 1: Multiply loop: Single iteration only + ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" + "ldr q5, [x10, #0x10]\n" + "ldr q6, [x10, #0x20]\n" + "sub x26, x26, #0x10\n" + "ldr q7, [x10, #0x30]\n" + "add x25, x25, #0x10\n" + ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" + "ldr q8, [x10, #0x40]\n" + ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" + "ldr q9, [x10, #0x50]\n" + ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" + "ldr q10, [x10, #0x60]\n" + ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" + "ldr q4, [x10, #0x70]\n" + "ldr q5, [x10, #0x80]\n" + ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + "ldr q8, [x10, #0xb0]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, 
v0.4b[2]\n" + "ldr q9, [x10, #0xc0]\n" + "ldr q10, [x10, #0xd0]\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + "ldr q4, [x10, #0xe0]\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + "ldr q5, [x10, #0xf0]\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + "add x10, x10, #0x100\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + "tbnz %x[flags], #31, 10f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + "10:" // Height 1: Multiply loop: unique 2: skip row sum + "prfm pldl1keep, [x25, #0x80]\n" + "11:" // Height 1: Multiply loop: Main loop skip + "cbz x26, 18f\n" + "cmp x26, #0x4\n" + "blt 14f\n" + "12:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "tbnz %x[flags], #31, 13f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + "13:" // Height 1: Multiply loop: unique 3: skip row sum + "ldr q6, [x10, #0x0]\n" + "sub x26, x26, #0x4\n" + "ldr q7, [x10, #0x10]\n" + "cmp x26, #0x4\n" + "ldr q8, [x10, #0x20]\n" + ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" + "ldr q9, [x10, #0x30]\n" + ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" + "add x10, x10, #0x40\n" + ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" + ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" + "bge 12b\n" + "cbz x26, 18f\n" + "14:" // Height 1: Multiply loop: Skip odd blocks + "tbz x26, #1, 15f\n" + "ldr h0, [x25], #0x2\n" + "tbz x26, #0, 16f\n" + "ld1 { v0.b }[2], [x25]\n" + "b 16f\n" + "15:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x25, #0x0]\n" + "16:" // Height 1: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 17f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + "17:" // Height 1: Multiply loop: unique 4: skip row sum + "ldr q10, [x10, #0x0]\n" + "ldr q4, [x10, 
#0x10]\n" + "ldr q5, [x10, #0x20]\n" + ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n" + "add x10, x10, #0x40\n" + ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n" + "18:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 4b\n" + "prfm pstl1keep, [x28, #0x0]\n" + "tbnz %x[flags], #31, 19f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "add x22, %x[qp], %[b_offset]\n" + "ld1r { v1.4s }, [x22]\n" + "neg v1.4s, v1.4s\n" + "addp v11.4s, v11.4s, v11.4s\n" + "mul v11.4s, v11.4s, v1.4s\n" + "19:" // Height 1: skip row sum fixup + "add v16.4s, v16.4s, v11.4s\n" + "add v17.4s, v17.4s, v11.4s\n" + "add v18.4s, v18.4s, v11.4s\n" + "add v19.4s, v19.4s, v11.4s\n" + "ldr q0, [x9, #0x0]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "ldr q1, [x9, #0x10]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "ldr q2, [x9, #0x20]\n" + "add x22, %x[qp], %[per_layer_mul]\n" + "add v16.4s, v16.4s, v0.4s\n" + "ldr q3, [x9, #0x30]\n" + "add v17.4s, v17.4s, v1.4s\n" + "ld1r { v0.4s }, [x23]\n" + "add v18.4s, v18.4s, v2.4s\n" + "ld1r { v4.4s }, [x22]\n" + "add v19.4s, v19.4s, v3.4s\n" + "add x9, x9, #0x40\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "tbz %x[flags], #5, 20f\n" + "and v4.16b, v16.16b, v0.16b\n" + "and v5.16b, v17.16b, v0.16b\n" + "and v6.16b, v18.16b, v0.16b\n" + "and v7.16b, v19.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "20:" // Height 1: no shift correction + "srshl v16.4s, v16.4s, v0.4s\n" + 
"srshl v17.4s, v17.4s, v0.4s\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "add x22, %x[qp], %[c_offset]\n" + "add x23, %x[qp], %[minval]\n" + "ld1r { v4.4s }, [x22]\n" + "add x22, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x23]\n" + "cmp x11, #0x10\n" + "ld1r { v6.4s }, [x22]\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "bge 29f\n" + "tbz x11, #3, 24f\n" + "str d16, [x28], #0x8\n" + "tbz x11, #2, 22f\n" + "st1 { v16.s }[2], [x28], #0x4\n" + "tbz x11, #1, 21f\n" + "st1 { v16.h }[6], [x28], #0x2\n" + "tbz x11, #0, 28f\n" + "st1 { v16.b }[14], [x28]\n" + "b 28f\n" + "21:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x11, #0, 28f\n" + "st1 { v16.b }[12], [x28]\n" + "b 28f\n" + "22:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x11, #1, 23f\n" + "st1 { v16.h }[4], [x28], #0x2\n" + "tbz x11, #0, 28f\n" + "st1 { v16.b }[10], [x28]\n" + "b 28f\n" + "23:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x11, #0, 28f\n" + "st1 { v16.b }[8], [x28]\n" + "b 28f\n" + "24:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x11, #2, 26f\n" + "str s16, [x28], #0x4\n" + "tbz x11, #1, 25f\n" + "st1 { v16.h }[2], [x28], #0x2\n" + "tbz x11, #0, 28f\n" + "st1 { v16.b }[6], [x28]\n" + "b 28f\n" + "25:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x11, #0, 28f\n" + "st1 { v16.b }[4], [x28]\n" + "b 28f\n" + "26:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x11, #1, 27f\n" + "str h16, [x28], #0x2\n" + "tbz x11, #0, 28f\n" + "st1 { v16.b }[2], 
[x28]\n" + "b 28f\n" + "27:" // Height 1: Partial direct writeback: partial_1_0 + "str b16, [x28, #0x0]\n" + "28:" // Height 1: Partial direct writeback: Done + "b 30f\n" + "29:" // Height 1: Full writeback + "str q16, [x28, #0x0]\n" + "add x28, x28, #0x10\n" + "30:" // Height 1: Writeback done + "subs x11, x11, #0x10\n" + "bgt 2b\n" + "b 122f\n" + "31:" // Height 2 + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v15.16b, #0x1\n" + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x9, %x[col_bias]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov x28, %x[output_ptr]\n" + "32:" // Height 2: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "33:" // Height 2: setup done + "mov x27, #0x0\n" + "34:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 35f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "cbnz x27, 36f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x22, x22, x19\n" + "b 36f\n" + "35:" // Height 2: setup direct input + "mov x25, %x[input_ptr]\n" + "add x22, x25, x19\n" + "36:" // Height 2: input setup done + "cmp x26, #0x10\n" + "blt 41f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + "cmp x26, #0x20\n" + "ldr q4, [x10, #0x0]\n" + "blt 39f\n" + "37:" // Height 2: Multiply loop: Main loop head + ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" + "ldr d5, [x10, #0x10]\n" + "add x25, x25, #0x10\n" + "ldr x24, [x10, #0x18]\n" + "add x22, x22, #0x10\n" + "ldr d6, [x10, #0x20]\n" + "ldr x23, [x10, 
#0x28]\n" + "mov v5.d[1], x24\n" + "ldr d7, [x10, #0x30]\n" + "ldr x19, [x10, #0x38]\n" + "mov v6.d[1], x23\n" + ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" + "ldr d8, [x10, #0x40]\n" + "mov v7.d[1], x19\n" + ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" + "ldr x23, [x10, #0x48]\n" + ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" + "ldr d9, [x10, #0x50]\n" + "mov v8.d[1], x23\n" + "ldr x19, [x10, #0x58]\n" + "ldr d10, [x10, #0x60]\n" + ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" + ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" + "mov v9.d[1], x19\n" + "ldr x23, [x10, #0x68]\n" + "ldr d4, [x10, #0x70]\n" + ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" + ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" + "mov v10.d[1], x23\n" + "ldr x19, [x10, #0x78]\n" + "ldr d5, [x10, #0x80]\n" + ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" + ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" + "mov v4.d[1], x19\n" + "ldr x24, [x10, #0x88]\n" + "ldr d6, [x10, #0x90]\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" + "mov v5.d[1], x24\n" + "ldr x23, [x10, #0x98]\n" + "ldr d7, [x10, #0xa0]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" + "mov v6.d[1], x23\n" + "ldr x19, [x10, #0xa8]\n" + "ldr d8, [x10, #0xb0]\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" + "mov v7.d[1], x19\n" + "ldr x23, [x10, #0xb8]\n" + "ldr d9, [x10, #0xc0]\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" + "mov v8.d[1], x23\n" + "ldr x19, [x10, #0xc8]\n" + "ldr d10, [x10, #0xd0]\n" + ".inst 0x4f80e913 // sdot 
v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" + "mov v9.d[1], x19\n" + "ldr x23, [x10, #0xd8]\n" + "ldr d4, [x10, #0xe0]\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" + "mov v10.d[1], x23\n" + "ldr x19, [x10, #0xe8]\n" + "ldr d5, [x10, #0xf0]\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" + "mov v4.d[1], x19\n" + "ldr x24, [x10, #0xf8]\n" + "add x10, x10, #0x100\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" + "mov v5.d[1], x24\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" + "tbnz %x[flags], #31, 38f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + "38:" // Height 2: Multiply loop: unique 5: skip row sum + "prfm pldl1keep, [x25, #0x80]\n" + "sub x26, x26, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "cmp x26, #0x20\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + "ldr q4, [x10, #0x0]\n" + "bge 37b\n" + "39:" // Height 2: Multiply loop: Single iteration only + ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" + "ldr q5, [x10, #0x10]\n" + "sub x26, x26, #0x10\n" + "ldr q6, [x10, #0x20]\n" + "add x25, x25, #0x10\n" + "ldr q7, [x10, #0x30]\n" + "add x22, x22, #0x10\n" + ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" + ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" + "ldr q8, [x10, #0x40]\n" + "ldr q9, [x10, #0x50]\n" + "ldr q10, [x10, #0x60]\n" + ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, 
v0.4b[1]\n" + ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" + ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" + ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" + ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" + ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" + "ldr q4, [x10, #0x70]\n" + "ldr q5, [x10, #0x80]\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" + "ldr q7, [x10, #0xa0]\n" + "ldr q8, [x10, #0xb0]\n" + "ldr q9, [x10, #0xc0]\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" + "ldr q10, [x10, #0xd0]\n" + "ldr q4, [x10, #0xe0]\n" + "ldr q5, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" + "tbnz %x[flags], #31, 40f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + "40:" // Height 2: Multiply loop: unique 6: skip row sum + "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "41:" // Height 2: Multiply loop: Main loop skip + "cbz x26, 48f\n" + "cmp x26, #0x4\n" + "blt 44f\n" + "42:" // Height 2: Multiply loop: Odd block loop 
+ "ldr s0, [x25], #0x4\n" + "ldr s1, [x22], #0x4\n" + "tbnz %x[flags], #31, 43f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + "43:" // Height 2: Multiply loop: unique 7: skip row sum + "ldr q6, [x10, #0x0]\n" + "sub x26, x26, #0x4\n" + "ldr q7, [x10, #0x10]\n" + "cmp x26, #0x4\n" + "ldr q8, [x10, #0x20]\n" + ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" + ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n" + "ldr q9, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" + ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n" + "bge 42b\n" + "cbz x26, 48f\n" + "44:" // Height 2: Multiply loop: Skip odd blocks + "tbz x26, #1, 45f\n" + "ldr h0, [x25], #0x2\n" + "ldr h1, [x22], #0x2\n" + "tbz x26, #0, 46f\n" + "ld1 { v0.b }[2], [x25]\n" + "ld1 { v1.b }[2], [x22]\n" + "b 46f\n" + "45:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x25, #0x0]\n" + "ldr b1, [x22, #0x0]\n" + "46:" // Height 2: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 47f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + "47:" // Height 2: Multiply loop: unique 8: skip row sum + "ldr q10, [x10, #0x0]\n" + "ldr q4, [x10, #0x10]\n" + "ldr q5, [x10, #0x20]\n" + ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n" + ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n" + ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n" + "ldr q6, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + ".inst 
0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n" + "48:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 34b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x21, x28, x19\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbnz %x[flags], #31, 49f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "addp v12.4s, v12.4s, v12.4s\n" + "add x22, %x[qp], %[b_offset]\n" + "ld1r { v2.4s }, [x22]\n" + "neg v2.4s, v2.4s\n" + "addp v11.4s, v11.4s, v11.4s\n" + "addp v12.4s, v12.4s, v12.4s\n" + "mul v11.4s, v11.4s, v2.4s\n" + "mul v12.4s, v12.4s, v2.4s\n" + "49:" // Height 2: skip row sum fixup + "add v16.4s, v16.4s, v11.4s\n" + "add v17.4s, v17.4s, v11.4s\n" + "add v18.4s, v18.4s, v11.4s\n" + "add v19.4s, v19.4s, v11.4s\n" + "add v20.4s, v20.4s, v12.4s\n" + "add v21.4s, v21.4s, v12.4s\n" + "add v22.4s, v22.4s, v12.4s\n" + "add v23.4s, v23.4s, v12.4s\n" + "ldr q0, [x9, #0x0]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "ldr q1, [x9, #0x10]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "ldr q2, [x9, #0x20]\n" + "add x22, %x[qp], %[per_layer_mul]\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "ldr q3, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + "ld1r { v0.4s }, [x23]\n" + "ld1r { v4.4s }, [x22]\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "sqrdmulh v22.4s, v22.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "tbz %x[flags], #5, 50f\n" + "and v4.16b, v16.16b, v0.16b\n" + "and v5.16b, v17.16b, 
v0.16b\n" + "and v6.16b, v18.16b, v0.16b\n" + "and v8.16b, v20.16b, v0.16b\n" + "and v9.16b, v21.16b, v0.16b\n" + "and v10.16b, v22.16b, v0.16b\n" + "and v7.16b, v19.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sqadd v20.4s, v20.4s, v8.4s\n" + "sqadd v21.4s, v21.4s, v9.4s\n" + "sqadd v22.4s, v22.4s, v10.4s\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "and v4.16b, v23.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v4.4s\n" + "50:" // Height 2: no shift correction + "srshl v16.4s, v16.4s, v0.4s\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "srshl v21.4s, v21.4s, v0.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "add x22, %x[qp], %[c_offset]\n" + "add x23, %x[qp], %[minval]\n" + "ld1r { v4.4s }, [x22]\n" + "add x22, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x23]\n" + "cmp x11, #0x10\n" + "ld1r { v6.4s }, [x22]\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, 
v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v20.8h, v20.8h, v21.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "bge 59f\n" + "tbz x11, #3, 54f\n" + "str d16, [x28], #0x8\n" + "str d20, [x21], #0x8\n" + "tbz x11, #2, 52f\n" + "st1 { v16.s }[2], [x28], #0x4\n" + "st1 { v20.s }[2], [x21], #0x4\n" + "tbz x11, #1, 51f\n" + "st1 { v16.h }[6], [x28], #0x2\n" + "st1 { v20.h }[6], [x21], #0x2\n" + "tbz x11, #0, 58f\n" + "st1 { v16.b }[14], [x28]\n" + "st1 { v20.b }[14], [x21]\n" + "b 58f\n" + "51:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x11, #0, 58f\n" + "st1 { v16.b }[12], [x28]\n" + "st1 { v20.b }[12], [x21]\n" + "b 58f\n" + "52:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x11, #1, 53f\n" + "st1 { v16.h }[4], [x28], #0x2\n" + "st1 { v20.h }[4], [x21], #0x2\n" + "tbz x11, #0, 58f\n" + "st1 { v16.b }[10], [x28]\n" + "st1 { v20.b }[10], [x21]\n" + "b 58f\n" + "53:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x11, #0, 58f\n" + "st1 { v16.b }[8], [x28]\n" + "st1 { v20.b }[8], [x21]\n" + "b 58f\n" + "54:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x11, #2, 56f\n" + "str s16, [x28], #0x4\n" + "str s20, [x21], #0x4\n" + "tbz x11, #1, 55f\n" + "st1 { v16.h }[2], [x28], #0x2\n" + "st1 { v20.h }[2], [x21], #0x2\n" + "tbz x11, #0, 58f\n" + "st1 { v16.b }[6], [x28]\n" + "st1 { v20.b }[6], [x21]\n" + "b 58f\n" + "55:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x11, #0, 58f\n" + "st1 { v16.b }[4], [x28]\n" + "st1 { v20.b }[4], [x21]\n" + "b 58f\n" + "56:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x11, #1, 57f\n" + "str h16, [x28], #0x2\n" + "str h20, [x21], #0x2\n" + "tbz x11, #0, 58f\n" + "st1 { v16.b }[2], [x28]\n" + "st1 { v20.b }[2], [x21]\n" + "b 58f\n" + "57:" // Height 2: Partial direct writeback: partial_1_0 + "str b16, [x28, #0x0]\n" + "str 
b20, [x21, #0x0]\n" + "58:" // Height 2: Partial direct writeback: Done + "b 60f\n" + "59:" // Height 2: Full writeback + "str q16, [x28, #0x0]\n" + "add x28, x28, #0x10\n" + "str q20, [x21, #0x0]\n" + "60:" // Height 2: Writeback done + "subs x11, x11, #0x10\n" + "bgt 32b\n" + "b 122f\n" + "61:" // Height 3 + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v15.16b, #0x1\n" + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[col_bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov x28, %x[output_ptr]\n" + "62:" // Height 3: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "63:" // Height 3: setup done + "mov x27, #0x0\n" + "64:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 65f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "ldr x21, [x20, #0x10]\n" + "cbnz x27, 66f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x22, x22, x19\n" + "add x21, x21, x19\n" + "b 66f\n" + "65:" // Height 3: setup direct input + "mov x25, %x[input_ptr]\n" + "add x22, x25, x19\n" + "add x21, x22, x19\n" + "66:" // Height 3: input setup done + "cmp x26, #0x10\n" + "blt 71f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + "cmp x26, #0x20\n" + "ldr q2, [x21, #0x0]\n" + "ldr q4, [x10, #0x0]\n" + "blt 69f\n" + "67:" // Height 3: Multiply loop: Main loop head + ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f81e094 // sdot v20.4s, v4.16b, 
v1.4b[0]\n" + ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n" + "ldr d5, [x10, #0x10]\n" + "ldr x24, [x10, #0x18]\n" + "add x25, x25, #0x10\n" + "ldr d6, [x10, #0x20]\n" + "add x22, x22, #0x10\n" + "ldr x23, [x10, #0x28]\n" + "add x21, x21, #0x10\n" + "mov v5.d[1], x24\n" + "ldr d7, [x10, #0x30]\n" + "ldr x19, [x10, #0x38]\n" + "mov v6.d[1], x23\n" + ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" + ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n" + "mov v7.d[1], x19\n" + ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n" + "ldr d8, [x10, #0x40]\n" + "ldr x23, [x10, #0x48]\n" + "ldr d9, [x10, #0x50]\n" + "ldr x19, [x10, #0x58]\n" + "mov v8.d[1], x23\n" + "ldr d10, [x10, #0x60]\n" + "ldr x23, [x10, #0x68]\n" + "mov v9.d[1], x19\n" + ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" + ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" + ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n" + "mov v10.d[1], x23\n" + ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" + ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" + ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n" + ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" + ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" + ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n" + "ldr d4, [x10, #0x70]\n" + "ldr x19, [x10, #0x78]\n" + "ldr d5, [x10, #0x80]\n" + "ldr x24, [x10, #0x88]\n" + "mov v4.d[1], x19\n" + "ldr d6, [x10, #0x90]\n" + "ldr x23, [x10, #0x98]\n" + "mov v5.d[1], x24\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, 
v2.4b[1]\n" + "mov v6.d[1], x23\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n" + "ldr d7, [x10, #0xa0]\n" + "ldr x19, [x10, #0xa8]\n" + "ldr d8, [x10, #0xb0]\n" + "ldr x23, [x10, #0xb8]\n" + "mov v7.d[1], x19\n" + "ldr d9, [x10, #0xc0]\n" + "ldr x19, [x10, #0xc8]\n" + "mov v8.d[1], x23\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n" + "mov v9.d[1], x19\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n" + "ldr d10, [x10, #0xd0]\n" + "ldr x23, [x10, #0xd8]\n" + "ldr d4, [x10, #0xe0]\n" + "ldr x19, [x10, #0xe8]\n" + "mov v10.d[1], x23\n" + "ldr d5, [x10, #0xf0]\n" + "ldr x24, [x10, #0xf8]\n" + "add x10, x10, #0x100\n" + "mov v4.d[1], x19\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n" + "mov v5.d[1], x24\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n" + "tbnz %x[flags], #31, 68f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot 
v12.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + "68:" // Height 3: Multiply loop: unique 9: skip row sum + "prfm pldl1keep, [x25, #0x80]\n" + "sub x26, x26, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "cmp x26, #0x20\n" + "prfm pldl1keep, [x21, #0x80]\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + "ldr q2, [x21, #0x0]\n" + "ldr q4, [x10, #0x0]\n" + "bge 67b\n" + "69:" // Height 3: Multiply loop: Single iteration only + ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n" + "ldr q5, [x10, #0x10]\n" + "ldr q6, [x10, #0x20]\n" + "sub x26, x26, #0x10\n" + "ldr q7, [x10, #0x30]\n" + "add x25, x25, #0x10\n" + ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" + ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n" + ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n" + "ldr q8, [x10, #0x40]\n" + "ldr q9, [x10, #0x50]\n" + "add x22, x22, #0x10\n" + "ldr q10, [x10, #0x60]\n" + "add x21, x21, #0x10\n" + ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" + ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" + ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n" + ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" + ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" + ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n" + ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" + ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" + ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n" + "ldr q4, [x10, #0x70]\n" + "ldr q5, [x10, #0x80]\n" + "ldr q6, [x10, #0x90]\n" + 
".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n" + "ldr q7, [x10, #0xa0]\n" + "ldr q8, [x10, #0xb0]\n" + "ldr q9, [x10, #0xc0]\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n" + "ldr q10, [x10, #0xd0]\n" + "ldr q4, [x10, #0xe0]\n" + "ldr q5, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n" + "tbnz %x[flags], #31, 70f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + "70:" // Height 3: Multiply loop: unique 10: skip row sum + "prfm pldl1keep, [x25, #0x80]\n" + 
"prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "71:" // Height 3: Multiply loop: Main loop skip + "cbz x26, 78f\n" + "cmp x26, #0x4\n" + "blt 74f\n" + "72:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "ldr s1, [x22], #0x4\n" + "ldr s2, [x21], #0x4\n" + "tbnz %x[flags], #31, 73f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + "73:" // Height 3: Multiply loop: unique 11: skip row sum + "ldr q6, [x10, #0x0]\n" + "sub x26, x26, #0x4\n" + "ldr q7, [x10, #0x10]\n" + "cmp x26, #0x4\n" + "ldr q8, [x10, #0x20]\n" + ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" + ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n" + ".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n" + "ldr q9, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" + ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n" + ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n" + "bge 72b\n" + "cbz x26, 78f\n" + "74:" // Height 3: Multiply loop: Skip odd blocks + "tbz x26, #1, 75f\n" + "ldr h0, [x25], #0x2\n" + "ldr h1, [x22], #0x2\n" + "ldr h2, [x21], #0x2\n" + "tbz x26, #0, 76f\n" + "ld1 { v0.b }[2], [x25]\n" + "ld1 { v1.b }[2], [x22]\n" + "ld1 { v2.b }[2], [x21]\n" + "b 76f\n" + "75:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x25, #0x0]\n" + "ldr b1, [x22, #0x0]\n" + "ldr b2, [x21, #0x0]\n" + "76:" // Height 3: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 77f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, 
v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + "77:" // Height 3: Multiply loop: unique 12: skip row sum + "ldr q10, [x10, #0x0]\n" + "ldr q4, [x10, #0x10]\n" + "ldr q5, [x10, #0x20]\n" + ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n" + ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n" + ".inst 0x4f82e158 // sdot v24.4s, v10.16b, v2.4b[0]\n" + ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4f82e099 // sdot v25.4s, v4.16b, v2.4b[0]\n" + ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n" + ".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n" + "ldr q6, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0db // sdot v27.4s, v6.16b, v2.4b[0]\n" + "78:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 64b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x21, x28, x19\n" + "prfm pstl1keep, [x21, #0x0]\n" + "add x20, x21, x19\n" + "prfm pstl1keep, [x20, #0x0]\n" + "tbnz %x[flags], #31, 79f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "addp v12.4s, v12.4s, v12.4s\n" + "addp v13.4s, v13.4s, v13.4s\n" + "add x22, %x[qp], %[b_offset]\n" + "ld1r { v3.4s }, [x22]\n" + "neg v3.4s, v3.4s\n" + "addp v11.4s, v11.4s, v11.4s\n" + "addp v12.4s, v12.4s, v12.4s\n" + "addp v13.4s, v13.4s, v13.4s\n" + "mul v11.4s, v11.4s, v3.4s\n" + "mul v12.4s, v12.4s, v3.4s\n" + "mul v13.4s, v13.4s, v3.4s\n" + "79:" // Height 3: skip row sum fixup + "add v16.4s, v16.4s, v11.4s\n" + "add v17.4s, v17.4s, v11.4s\n" + "add v18.4s, v18.4s, v11.4s\n" + "add v19.4s, v19.4s, v11.4s\n" + "add v20.4s, v20.4s, v12.4s\n" + "add 
v21.4s, v21.4s, v12.4s\n" + "add v22.4s, v22.4s, v12.4s\n" + "add v23.4s, v23.4s, v12.4s\n" + "add v24.4s, v24.4s, v13.4s\n" + "add v25.4s, v25.4s, v13.4s\n" + "add v26.4s, v26.4s, v13.4s\n" + "add v27.4s, v27.4s, v13.4s\n" + "ldr q0, [x9, #0x0]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "ldr q1, [x9, #0x10]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "ldr q2, [x9, #0x20]\n" + "add x22, %x[qp], %[per_layer_mul]\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "ldr q3, [x9, #0x30]\n" + "ld1r { v0.4s }, [x23]\n" + "add x9, x9, #0x40\n" + "ld1r { v4.4s }, [x22]\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "sqrdmulh v22.4s, v22.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "sqrdmulh v24.4s, v24.4s, v4.4s\n" + "sqrdmulh v25.4s, v25.4s, v4.4s\n" + "sqrdmulh v26.4s, v26.4s, v4.4s\n" + "sqrdmulh v27.4s, v27.4s, v4.4s\n" + "tbz %x[flags], #5, 80f\n" + "and v4.16b, v16.16b, v0.16b\n" + "and v5.16b, v17.16b, v0.16b\n" + "and v6.16b, v18.16b, v0.16b\n" + "and v7.16b, v19.16b, v0.16b\n" + "and v8.16b, v20.16b, v0.16b\n" + "and v9.16b, v21.16b, v0.16b\n" + "and v10.16b, v22.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "and v4.16b, v23.16b, v0.16b\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sqadd 
v19.4s, v19.4s, v7.4s\n" + "sqadd v20.4s, v20.4s, v8.4s\n" + "sqadd v21.4s, v21.4s, v9.4s\n" + "sqadd v22.4s, v22.4s, v10.4s\n" + "and v5.16b, v24.16b, v0.16b\n" + "and v6.16b, v25.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v7.16b, v26.16b, v0.16b\n" + "and v8.16b, v27.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v4.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "sqadd v24.4s, v24.4s, v5.4s\n" + "sqadd v25.4s, v25.4s, v6.4s\n" + "sqadd v26.4s, v26.4s, v7.4s\n" + "sqadd v27.4s, v27.4s, v8.4s\n" + "80:" // Height 3: no shift correction + "srshl v16.4s, v16.4s, v0.4s\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "srshl v21.4s, v21.4s, v0.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" + "srshl v25.4s, v25.4s, v0.4s\n" + "srshl v26.4s, v26.4s, v0.4s\n" + "srshl v27.4s, v27.4s, v0.4s\n" + "add x22, %x[qp], %[c_offset]\n" + "add x23, %x[qp], %[minval]\n" + "ld1r { v4.4s }, [x22]\n" + "add x22, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x23]\n" + "cmp x11, #0x10\n" + "ld1r { v6.4s }, [x22]\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + 
"smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v20.8h, v20.8h, v21.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v24.8h, v24.8h, v25.8h\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 89f\n" + "tbz x11, #3, 84f\n" + "str d16, [x28], #0x8\n" + "str d20, [x21], #0x8\n" + "str d24, [x20], #0x8\n" + "tbz x11, #2, 82f\n" + "st1 { v16.s }[2], [x28], #0x4\n" + "st1 { v20.s }[2], [x21], #0x4\n" + "st1 { v24.s }[2], [x20], #0x4\n" + "tbz x11, #1, 81f\n" + "st1 { v16.h }[6], [x28], #0x2\n" + "st1 { v20.h }[6], [x21], #0x2\n" + "st1 { v24.h }[6], [x20], #0x2\n" + "tbz x11, #0, 88f\n" + "st1 { v16.b }[14], [x28]\n" + "st1 { v20.b }[14], [x21]\n" + "st1 { v24.b }[14], [x20]\n" + "b 88f\n" + "81:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x11, #0, 88f\n" + "st1 { v16.b }[12], [x28]\n" + "st1 { v20.b }[12], [x21]\n" + "st1 { v24.b }[12], [x20]\n" + "b 88f\n" + "82:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x11, #1, 83f\n" + "st1 { v16.h }[4], [x28], #0x2\n" + "st1 { v20.h }[4], [x21], #0x2\n" + "st1 { v24.h }[4], [x20], #0x2\n" + "tbz x11, #0, 88f\n" + "st1 { v16.b }[10], [x28]\n" + "st1 { v20.b }[10], [x21]\n" + "st1 { v24.b }[10], [x20]\n" + "b 88f\n" + "83:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x11, #0, 88f\n" + "st1 { v16.b }[8], [x28]\n" + "st1 { v20.b }[8], [x21]\n" + "st1 { v24.b }[8], [x20]\n" + "b 88f\n" + "84:" // Height 3: Partial direct writeback: partial_4_0 + 
"tbz x11, #2, 86f\n" + "str s16, [x28], #0x4\n" + "str s20, [x21], #0x4\n" + "str s24, [x20], #0x4\n" + "tbz x11, #1, 85f\n" + "st1 { v16.h }[2], [x28], #0x2\n" + "st1 { v20.h }[2], [x21], #0x2\n" + "st1 { v24.h }[2], [x20], #0x2\n" + "tbz x11, #0, 88f\n" + "st1 { v16.b }[6], [x28]\n" + "st1 { v20.b }[6], [x21]\n" + "st1 { v24.b }[6], [x20]\n" + "b 88f\n" + "85:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x11, #0, 88f\n" + "st1 { v16.b }[4], [x28]\n" + "st1 { v20.b }[4], [x21]\n" + "st1 { v24.b }[4], [x20]\n" + "b 88f\n" + "86:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x11, #1, 87f\n" + "str h16, [x28], #0x2\n" + "str h20, [x21], #0x2\n" + "str h24, [x20], #0x2\n" + "tbz x11, #0, 88f\n" + "st1 { v16.b }[2], [x28]\n" + "st1 { v20.b }[2], [x21]\n" + "st1 { v24.b }[2], [x20]\n" + "b 88f\n" + "87:" // Height 3: Partial direct writeback: partial_1_0 + "str b16, [x28, #0x0]\n" + "str b20, [x21, #0x0]\n" + "str b24, [x20, #0x0]\n" + "88:" // Height 3: Partial direct writeback: Done + "b 90f\n" + "89:" // Height 3: Full writeback + "str q16, [x28, #0x0]\n" + "add x28, x28, #0x10\n" + "str q20, [x21, #0x0]\n" + "str q24, [x20, #0x0]\n" + "90:" // Height 3: Writeback done + "subs x11, x11, #0x10\n" + "bgt 62b\n" + "b 122f\n" + "91:" // Height 4 + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.16b, #0x1\n" + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x9, %x[col_bias]\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov x28, %x[output_ptr]\n" + "mov x19, #0x4\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "92:" // Height 4: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, 
#0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "93:" // Height 4: setup done + "mov x27, #0x0\n" + "94:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 95f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "ldr x21, [x20, #0x10]\n" + "ldr x20, [x20, #0x18]\n" + "cbnz x27, 96f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x22, x22, x19\n" + "add x21, x21, x19\n" + "add x20, x20, x19\n" + "b 96f\n" + "95:" // Height 4: setup direct input + "mov x25, %x[input_ptr]\n" + "add x22, x25, x19\n" + "add x21, x22, x19\n" + "add x20, x21, x19\n" + "96:" // Height 4: input setup done + "cmp x26, #0x10\n" + "blt 101f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + "cmp x26, #0x20\n" + "ldr q2, [x21, #0x0]\n" + "ldr q3, [x20, #0x0]\n" + "ldr q4, [x10, #0x0]\n" + "blt 99f\n" + "97:" // Height 4: Multiply loop: Main loop head + ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n" + ".inst 0x4f83e09c // sdot v28.4s, v4.16b, v3.4b[0]\n" + "ldr d5, [x10, #0x10]\n" + "add x25, x25, #0x10\n" + "ldr x24, [x10, #0x18]\n" + "add x22, x22, #0x10\n" + "ldr d6, [x10, #0x20]\n" + "add x21, x21, #0x10\n" + "ldr x23, [x10, #0x28]\n" + "add x20, x20, #0x10\n" + "mov v5.d[1], x24\n" + "ldr d7, [x10, #0x30]\n" + "ldr x19, [x10, #0x38]\n" + "mov v6.d[1], x23\n" + ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" + ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n" + "mov v7.d[1], x19\n" + ".inst 0x4f83e0bd // sdot v29.4s, v5.16b, 
v3.4b[0]\n" + ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0de // sdot v30.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0ff // sdot v31.4s, v7.16b, v3.4b[0]\n" + "ldr d8, [x10, #0x40]\n" + "ldr x23, [x10, #0x48]\n" + "ldr d9, [x10, #0x50]\n" + "ldr x19, [x10, #0x58]\n" + "mov v8.d[1], x23\n" + "ldr d10, [x10, #0x60]\n" + "ldr x23, [x10, #0x68]\n" + "mov v9.d[1], x19\n" + ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" + ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" + ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n" + "mov v10.d[1], x23\n" + ".inst 0x4fa3e11c // sdot v28.4s, v8.16b, v3.4b[1]\n" + ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" + ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" + ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n" + ".inst 0x4fa3e13d // sdot v29.4s, v9.16b, v3.4b[1]\n" + ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" + ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" + ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n" + ".inst 0x4fa3e15e // sdot v30.4s, v10.16b, v3.4b[1]\n" + "ldr d4, [x10, #0x70]\n" + "ldr x19, [x10, #0x78]\n" + "ldr d5, [x10, #0x80]\n" + "ldr x24, [x10, #0x88]\n" + "mov v4.d[1], x19\n" + "ldr d6, [x10, #0x90]\n" + "ldr x23, [x10, #0x98]\n" + "mov v5.d[1], x24\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n" + "mov v6.d[1], x23\n" + ".inst 0x4fa3e09f // sdot v31.4s, v4.16b, v3.4b[1]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n" + 
".inst 0x4f83e8bc // sdot v28.4s, v5.16b, v3.4b[2]\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8dd // sdot v29.4s, v6.16b, v3.4b[2]\n" + "ldr d7, [x10, #0xa0]\n" + "ldr x19, [x10, #0xa8]\n" + "ldr d8, [x10, #0xb0]\n" + "ldr x23, [x10, #0xb8]\n" + "mov v7.d[1], x19\n" + "ldr d9, [x10, #0xc0]\n" + "ldr x19, [x10, #0xc8]\n" + "mov v8.d[1], x23\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n" + "mov v9.d[1], x19\n" + ".inst 0x4f83e8fe // sdot v30.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 0x4f83e91f // sdot v31.4s, v8.16b, v3.4b[2]\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n" + ".inst 0x4fa3e93c // sdot v28.4s, v9.16b, v3.4b[3]\n" + "ldr d10, [x10, #0xd0]\n" + "ldr x23, [x10, #0xd8]\n" + "ldr d4, [x10, #0xe0]\n" + "ldr x19, [x10, #0xe8]\n" + "mov v10.d[1], x23\n" + "ldr d5, [x10, #0xf0]\n" + "ldr x24, [x10, #0xf8]\n" + "add x10, x10, #0x100\n" + "mov v4.d[1], x19\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n" + "mov v5.d[1], x24\n" + ".inst 0x4fa3e95d // sdot v29.4s, v10.16b, v3.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x4fa3e89e // sdot v30.4s, v4.16b, v3.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, 
v1.4b[3]\n" + ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n" + ".inst 0x4fa3e8bf // sdot v31.4s, v5.16b, v3.4b[3]\n" + "tbnz %x[flags], #31, 98f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" + "98:" // Height 4: Multiply loop: unique 13: skip row sum + "prfm pldl1keep, [x25, #0x80]\n" + "sub x26, x26, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "cmp x26, #0x20\n" + "prfm pldl1keep, [x21, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + "ldr q2, [x21, #0x0]\n" + "ldr q3, [x20, #0x0]\n" + "ldr q4, [x10, #0x0]\n" + "bge 97b\n" + "99:" // Height 4: Multiply loop: Single iteration only + ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n" + ".inst 0x4f83e09c // sdot v28.4s, v4.16b, v3.4b[0]\n" + "ldr q5, [x10, #0x10]\n" + "sub x26, x26, #0x10\n" + "ldr q6, [x10, #0x20]\n" + "add x25, x25, #0x10\n" + "ldr q7, [x10, #0x30]\n" + "add x22, x22, #0x10\n" + ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" + ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n" + ".inst 0x4f83e0bd // sdot v29.4s, v5.16b, v3.4b[0]\n" + ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0de // sdot v30.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0ff // sdot v31.4s, v7.16b, v3.4b[0]\n" + "ldr q8, [x10, #0x40]\n" + "add x21, x21, #0x10\n" + "ldr q9, [x10, #0x50]\n" + "add x20, x20, #0x10\n" + "ldr q10, 
[x10, #0x60]\n" + ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" + ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" + ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n" + ".inst 0x4fa3e11c // sdot v28.4s, v8.16b, v3.4b[1]\n" + ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" + ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" + ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n" + ".inst 0x4fa3e13d // sdot v29.4s, v9.16b, v3.4b[1]\n" + ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" + ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" + ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n" + ".inst 0x4fa3e15e // sdot v30.4s, v10.16b, v3.4b[1]\n" + "ldr q4, [x10, #0x70]\n" + "ldr q5, [x10, #0x80]\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n" + ".inst 0x4fa3e09f // sdot v31.4s, v4.16b, v3.4b[1]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n" + ".inst 0x4f83e8bc // sdot v28.4s, v5.16b, v3.4b[2]\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8dd // sdot v29.4s, v6.16b, v3.4b[2]\n" + "ldr q7, [x10, #0xa0]\n" + "ldr q8, [x10, #0xb0]\n" + "ldr q9, [x10, #0xc0]\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8fe // sdot v30.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 0x4f83e91f // sdot v31.4s, v8.16b, v3.4b[2]\n" + ".inst 0x4fa0e930 // 
sdot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n" + ".inst 0x4fa3e93c // sdot v28.4s, v9.16b, v3.4b[3]\n" + "ldr q10, [x10, #0xd0]\n" + "ldr q4, [x10, #0xe0]\n" + "ldr q5, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n" + ".inst 0x4fa3e95d // sdot v29.4s, v10.16b, v3.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x4fa3e89e // sdot v30.4s, v4.16b, v3.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n" + ".inst 0x4fa3e8bf // sdot v31.4s, v5.16b, v3.4b[3]\n" + "tbnz %x[flags], #31, 100f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" + "100:" // Height 4: Multiply loop: unique 14: skip row sum + "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "101:" // Height 4: Multiply loop: Main loop skip + "cbz x26, 108f\n" + "cmp x26, #0x4\n" + "blt 104f\n" + "102:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "ldr s1, [x22], #0x4\n" + "ldr s2, [x21], #0x4\n" + "ldr s3, [x20], #0x4\n" + "tbnz %x[flags], #31, 103f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" + "103:" // Height 4: Multiply loop: unique 
15: skip row sum + "ldr q6, [x10, #0x0]\n" + "sub x26, x26, #0x4\n" + "ldr q7, [x10, #0x10]\n" + "cmp x26, #0x4\n" + "ldr q8, [x10, #0x20]\n" + ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0dc // sdot v28.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0fd // sdot v29.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" + ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n" + ".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n" + ".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n" + "ldr q9, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" + ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n" + ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n" + ".inst 0x4f83e13f // sdot v31.4s, v9.16b, v3.4b[0]\n" + "bge 102b\n" + "cbz x26, 108f\n" + "104:" // Height 4: Multiply loop: Skip odd blocks + "tbz x26, #1, 105f\n" + "ldr h0, [x25], #0x2\n" + "ldr h1, [x22], #0x2\n" + "ldr h2, [x21], #0x2\n" + "ldr h3, [x20], #0x2\n" + "tbz x26, #0, 106f\n" + "ld1 { v0.b }[2], [x25]\n" + "ld1 { v1.b }[2], [x22]\n" + "ld1 { v2.b }[2], [x21]\n" + "ld1 { v3.b }[2], [x20]\n" + "b 106f\n" + "105:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x25, #0x0]\n" + "ldr b1, [x22, #0x0]\n" + "ldr b2, [x21, #0x0]\n" + "ldr b3, [x20, #0x0]\n" + "106:" // Height 4: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 107f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" + "107:" // Height 4: Multiply loop: unique 16: 
skip row sum + "ldr q10, [x10, #0x0]\n" + "ldr q4, [x10, #0x10]\n" + "ldr q5, [x10, #0x20]\n" + ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n" + ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n" + ".inst 0x4f82e158 // sdot v24.4s, v10.16b, v2.4b[0]\n" + ".inst 0x4f83e15c // sdot v28.4s, v10.16b, v3.4b[0]\n" + ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4f82e099 // sdot v25.4s, v4.16b, v2.4b[0]\n" + ".inst 0x4f83e09d // sdot v29.4s, v4.16b, v3.4b[0]\n" + ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n" + ".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n" + ".inst 0x4f83e0be // sdot v30.4s, v5.16b, v3.4b[0]\n" + "ldr q6, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0db // sdot v27.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0df // sdot v31.4s, v6.16b, v3.4b[0]\n" + "108:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 94b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x21, x28, x19\n" + "prfm pstl1keep, [x21, #0x0]\n" + "add x20, x21, x19\n" + "prfm pstl1keep, [x20, #0x0]\n" + "add x19, x20, x19\n" + "prfm pstl1keep, [x19, #0x0]\n" + "tbnz %x[flags], #31, 109f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "addp v12.4s, v12.4s, v12.4s\n" + "addp v13.4s, v13.4s, v13.4s\n" + "addp v14.4s, v14.4s, v14.4s\n" + "add x22, %x[qp], %[b_offset]\n" + "ld1r { v4.4s }, [x22]\n" + "addp v11.4s, v11.4s, v11.4s\n" + "addp v12.4s, v12.4s, v12.4s\n" + "neg v4.4s, v4.4s\n" + "addp v13.4s, v13.4s, v13.4s\n" + "addp v14.4s, v14.4s, v14.4s\n" + "mul v11.4s, v11.4s, v4.4s\n" + "mul v12.4s, v12.4s, v4.4s\n" + "mul v13.4s, v13.4s, v4.4s\n" + "mul v14.4s, v14.4s, v4.4s\n" + 
"109:" // Height 4: skip row sum fixup + "add v16.4s, v16.4s, v11.4s\n" + "add v17.4s, v17.4s, v11.4s\n" + "add v18.4s, v18.4s, v11.4s\n" + "add v19.4s, v19.4s, v11.4s\n" + "add v20.4s, v20.4s, v12.4s\n" + "add v21.4s, v21.4s, v12.4s\n" + "add v22.4s, v22.4s, v12.4s\n" + "add v23.4s, v23.4s, v12.4s\n" + "add v24.4s, v24.4s, v13.4s\n" + "add v25.4s, v25.4s, v13.4s\n" + "add v26.4s, v26.4s, v13.4s\n" + "add v27.4s, v27.4s, v13.4s\n" + "add v28.4s, v28.4s, v14.4s\n" + "add v29.4s, v29.4s, v14.4s\n" + "add v30.4s, v30.4s, v14.4s\n" + "add v31.4s, v31.4s, v14.4s\n" + "ldr q0, [x9, #0x0]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "ldr q1, [x9, #0x10]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "ldr q2, [x9, #0x20]\n" + "add x22, %x[qp], %[per_layer_mul]\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v28.4s, v28.4s, v0.4s\n" + "add v29.4s, v29.4s, v1.4s\n" + "add v30.4s, v30.4s, v2.4s\n" + "ldr q3, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + "ld1r { v0.4s }, [x23]\n" + "ld1r { v4.4s }, [x22]\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "add v31.4s, v31.4s, v3.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "sqrdmulh v22.4s, v22.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "sqrdmulh v24.4s, v24.4s, v4.4s\n" + "sqrdmulh v25.4s, v25.4s, v4.4s\n" + "sqrdmulh v26.4s, v26.4s, v4.4s\n" + "sqrdmulh v27.4s, v27.4s, v4.4s\n" + "sqrdmulh v28.4s, v28.4s, v4.4s\n" + "sqrdmulh v29.4s, v29.4s, v4.4s\n" + "sqrdmulh v30.4s, v30.4s, v4.4s\n" + "sqrdmulh v31.4s, v31.4s, v4.4s\n" + "tbz 
%x[flags], #5, 110f\n" + "and v4.16b, v16.16b, v0.16b\n" + "and v5.16b, v17.16b, v0.16b\n" + "and v6.16b, v18.16b, v0.16b\n" + "and v7.16b, v19.16b, v0.16b\n" + "and v8.16b, v20.16b, v0.16b\n" + "and v9.16b, v21.16b, v0.16b\n" + "and v10.16b, v22.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "and v4.16b, v23.16b, v0.16b\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "sqadd v20.4s, v20.4s, v8.4s\n" + "sqadd v21.4s, v21.4s, v9.4s\n" + "sqadd v22.4s, v22.4s, v10.4s\n" + "and v5.16b, v24.16b, v0.16b\n" + "and v6.16b, v25.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v7.16b, v26.16b, v0.16b\n" + "and v8.16b, v27.16b, v0.16b\n" + "and v9.16b, v28.16b, v0.16b\n" + "and v10.16b, v29.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v4.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "and v4.16b, v30.16b, v0.16b\n" + "sqadd v24.4s, v24.4s, v5.4s\n" + "sqadd v25.4s, v25.4s, v6.4s\n" + "sqadd v26.4s, v26.4s, v7.4s\n" + "and v5.16b, v31.16b, v0.16b\n" + "sqadd v27.4s, v27.4s, v8.4s\n" + "sqadd v28.4s, v28.4s, v9.4s\n" + "sqadd v29.4s, v29.4s, v10.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v30.4s, v30.4s, v4.4s\n" + "sqadd v31.4s, v31.4s, v5.4s\n" + "110:" // Height 4: no shift correction + "srshl v16.4s, v16.4s, v0.4s\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "srshl v21.4s, v21.4s, v0.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" + "srshl v25.4s, v25.4s, v0.4s\n" + 
"srshl v26.4s, v26.4s, v0.4s\n" + "srshl v27.4s, v27.4s, v0.4s\n" + "srshl v28.4s, v28.4s, v0.4s\n" + "srshl v29.4s, v29.4s, v0.4s\n" + "srshl v30.4s, v30.4s, v0.4s\n" + "srshl v31.4s, v31.4s, v0.4s\n" + "add x22, %x[qp], %[c_offset]\n" + "add x23, %x[qp], %[minval]\n" + "ld1r { v4.4s }, [x22]\n" + "add x22, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x23]\n" + "cmp x11, #0x10\n" + "ld1r { v6.4s }, [x22]\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "add v28.4s, v28.4s, v4.4s\n" + "add v29.4s, v29.4s, v4.4s\n" + "add v30.4s, v30.4s, v4.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v20.8h, v20.8h, v21.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smin v28.4s, v28.4s, v6.4s\n" + "smin v29.4s, v29.4s, v6.4s\n" + "smin v30.4s, v30.4s, v6.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" + "uzp1 v24.8h, v24.8h, v25.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 
v20.16b, v20.16b, v21.16b\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" + "smax v28.4s, v28.4s, v5.4s\n" + "smax v29.4s, v29.4s, v5.4s\n" + "smax v30.4s, v30.4s, v5.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v28.8h, v28.8h, v29.8h\n" + "uzp1 v29.8h, v30.8h, v31.8h\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "uzp1 v28.16b, v28.16b, v29.16b\n" + "bge 119f\n" + "tbz x11, #3, 114f\n" + "str d16, [x28], #0x8\n" + "str d20, [x21], #0x8\n" + "str d24, [x20], #0x8\n" + "str d28, [x19], #0x8\n" + "tbz x11, #2, 112f\n" + "st1 { v16.s }[2], [x28], #0x4\n" + "st1 { v20.s }[2], [x21], #0x4\n" + "st1 { v24.s }[2], [x20], #0x4\n" + "st1 { v28.s }[2], [x19], #0x4\n" + "tbz x11, #1, 111f\n" + "st1 { v16.h }[6], [x28], #0x2\n" + "st1 { v20.h }[6], [x21], #0x2\n" + "st1 { v24.h }[6], [x20], #0x2\n" + "st1 { v28.h }[6], [x19], #0x2\n" + "tbz x11, #0, 118f\n" + "st1 { v16.b }[14], [x28]\n" + "st1 { v20.b }[14], [x21]\n" + "st1 { v24.b }[14], [x20]\n" + "st1 { v28.b }[14], [x19]\n" + "b 118f\n" + "111:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x11, #0, 118f\n" + "st1 { v16.b }[12], [x28]\n" + "st1 { v20.b }[12], [x21]\n" + "st1 { v24.b }[12], [x20]\n" + "st1 { v28.b }[12], [x19]\n" + "b 118f\n" + "112:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x11, #1, 113f\n" + "st1 { v16.h }[4], [x28], #0x2\n" + "st1 { v20.h }[4], [x21], #0x2\n" + "st1 { v24.h }[4], [x20], #0x2\n" + "st1 { v28.h }[4], [x19], #0x2\n" + "tbz x11, #0, 118f\n" + "st1 { v16.b }[10], [x28]\n" + "st1 { v20.b }[10], [x21]\n" + "st1 { v24.b }[10], [x20]\n" + "st1 { v28.b }[10], [x19]\n" + "b 118f\n" + "113:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x11, #0, 118f\n" + "st1 { v16.b }[8], [x28]\n" + "st1 { v20.b }[8], [x21]\n" + "st1 { v24.b }[8], [x20]\n" + "st1 { v28.b }[8], [x19]\n" + "b 118f\n" + "114:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x11, #2, 116f\n" + "str s16, [x28], #0x4\n" + "str s20, 
[x21], #0x4\n" + "str s24, [x20], #0x4\n" + "str s28, [x19], #0x4\n" + "tbz x11, #1, 115f\n" + "st1 { v16.h }[2], [x28], #0x2\n" + "st1 { v20.h }[2], [x21], #0x2\n" + "st1 { v24.h }[2], [x20], #0x2\n" + "st1 { v28.h }[2], [x19], #0x2\n" + "tbz x11, #0, 118f\n" + "st1 { v16.b }[6], [x28]\n" + "st1 { v20.b }[6], [x21]\n" + "st1 { v24.b }[6], [x20]\n" + "st1 { v28.b }[6], [x19]\n" + "b 118f\n" + "115:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x11, #0, 118f\n" + "st1 { v16.b }[4], [x28]\n" + "st1 { v20.b }[4], [x21]\n" + "st1 { v24.b }[4], [x20]\n" + "st1 { v28.b }[4], [x19]\n" + "b 118f\n" + "116:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x11, #1, 117f\n" + "str h16, [x28], #0x2\n" + "str h20, [x21], #0x2\n" + "str h24, [x20], #0x2\n" + "str h28, [x19], #0x2\n" + "tbz x11, #0, 118f\n" + "st1 { v16.b }[2], [x28]\n" + "st1 { v20.b }[2], [x21]\n" + "st1 { v24.b }[2], [x20]\n" + "st1 { v28.b }[2], [x19]\n" + "b 118f\n" + "117:" // Height 4: Partial direct writeback: partial_1_0 + "str b16, [x28, #0x0]\n" + "str b20, [x21, #0x0]\n" + "str b24, [x20, #0x0]\n" + "str b28, [x19, #0x0]\n" + "118:" // Height 4: Partial direct writeback: Done + "b 120f\n" + "119:" // Height 4: Full writeback + "str q16, [x28, #0x0]\n" + "add x28, x28, #0x10\n" + "str q20, [x21, #0x0]\n" + "str q24, [x20, #0x0]\n" + "str q28, [x19, #0x0]\n" + "120:" // Height 4: Writeback done + "subs x11, x11, #0x10\n" + "bgt 92b\n" + "subs %x[M], %x[M], #0x4\n" + "beq 122f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 121f\n" + "add x20, x20, #0x4\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "121:" // Update direct input + "mov x19, #0x4\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "122:" // Exit + + : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" 
(offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp index 17575bd611..0adfb99f23 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp @@ -81,216 +81,207 @@ void a64_hybrid_s8qa_dot_4x16 ( "1:" // Row loop "cmp %x[M], #0x4\n" - "bge 94f\n" + "bge 91f\n" "cmp %x[M], #0x2\n" - "bgt 63f\n" - "beq 32f\n" + "bgt 61f\n" + "beq 31f\n" "movi v11.4s, #0x0\n" - "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" - "movi v12.4s, #0x0\n" - "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "mov x10, %x[col_bias]\n" - "movi v13.4s, #0x0\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "bic %x[flags], %x[flags], #0x80000000\n" - "movi 
v14.4s, #0x0\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" "movi v15.16b, #0x1\n" - "tbz %x[flags], #2, 2f\n" - "ldr x9, [%x[output_ptr], #0x0]\n" - "add x9, x9, x19\n" - "b 3f\n" - "2:" // Height 1: setup direct output - "mov x9, %x[output_ptr]\n" - "3:" // Height 1: Column loop + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x27, %x[col_bias]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov x26, %x[output_ptr]\n" + "2:" // Height 1: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" "movi v19.4s, #0x0\n" - "4:" // Height 1: setup done - "mov x28, #0x0\n" - "5:" // Height 1: String loop + "3:" // Height 1: setup done + "mov x25, #0x0\n" + "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 6f\n" - "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 5f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x26, [x20, #0x0]\n" - "cbnz x28, 7f\n" + "ldr x23, [x20, #0x0]\n" + "cbnz x25, 6f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x19\n" - "b 7f\n" - "6:" // Height 1: setup direct input - "mov x26, %x[input_ptr]\n" - "7:" // Height 1: input setup done - "cmp x27, #0x10\n" - "blt 12f\n" - "cmp x27, #0x20\n" - "blt 10f\n" - "8:" // Height 1: Multiply loop: Main loop head - "ldr q0, [x26, #0x0]\n" - "ldr q4, [x11, #0x0]\n" + "add x23, x23, x19\n" + "b 6f\n" + "5:" // Height 1: setup direct input + "mov x23, %x[input_ptr]\n" + "6:" // Height 1: input setup done + "cmp x24, #0x10\n" + "blt 11f\n" + "ldr q0, [x23, #0x0]\n" + "ldr q4, [x28, #0x0]\n" + "cmp x24, #0x20\n" + "blt 9f\n" + "7:" // Height 1: Multiply loop: Main loop head ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" - "ldr q5, [x11, #0x10]\n" - "ldr q6, [x11, #0x20]\n" + "ldr q5, [x28, #0x10]\n" + "add x23, 
x23, #0x10\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" - "ldr q7, [x11, #0x30]\n" + "ldr q6, [x28, #0x20]\n" + "ldr q7, [x28, #0x30]\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" - "ldr q8, [x11, #0x40]\n" - "ldr q9, [x11, #0x50]\n" + "ldr q8, [x28, #0x40]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" - "ldr q10, [x11, #0x60]\n" - "ldr q4, [x11, #0x70]\n" + "ldr q9, [x28, #0x50]\n" + "ldr q10, [x28, #0x60]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" + "ldr q4, [x28, #0x70]\n" + "ldr q5, [x28, #0x80]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" - "ldr q5, [x11, #0x80]\n" - "ldr q6, [x11, #0x90]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" - "ldr q7, [x11, #0xa0]\n" + "ldr q6, [x28, #0x90]\n" + "ldr q7, [x28, #0xa0]\n" ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" - "ldr q8, [x11, #0xb0]\n" - "ldr q9, [x11, #0xc0]\n" + "ldr q8, [x28, #0xb0]\n" ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + "ldr q9, [x28, #0xc0]\n" + "ldr q10, [x28, #0xd0]\n" ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" - "ldr q10, [x11, #0xd0]\n" - "ldr q4, [x11, #0xe0]\n" ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" - "ldr q5, [x11, #0xf0]\n" - "add x26, x26, #0x10\n" + "ldr q4, [x28, #0xe0]\n" + "ldr q5, [x28, #0xf0]\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + "add x28, x28, #0x100\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + "tbnz %x[flags], #31, 8f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + "8:" // Height 1: Multiply loop: unique 1: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "sub x24, x24, #0x10\n" + "ldr q0, [x23, #0x0]\n" + "cmp x24, #0x20\n" + "ldr q4, [x28, #0x0]\n" + "bge 7b\n" + "9:" // Height 1: Multiply loop: Single iteration only + ".inst 0x4f80e090 // sdot 
v16.4s, v4.16b, v0.4b[0]\n" + "ldr q5, [x28, #0x10]\n" + "sub x24, x24, #0x10\n" + ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" + "ldr q6, [x28, #0x20]\n" + "add x23, x23, #0x10\n" + ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x28, #0x30]\n" + "ldr q8, [x28, #0x40]\n" + ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" + "ldr q9, [x28, #0x50]\n" + ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" + "ldr q10, [x28, #0x60]\n" + "ldr q4, [x28, #0x70]\n" + ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" + "ldr q5, [x28, #0x80]\n" + "ldr q6, [x28, #0x90]\n" + ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + "ldr q7, [x28, #0xa0]\n" + "ldr q8, [x28, #0xb0]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + "ldr q9, [x28, #0xc0]\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + "ldr q10, [x28, #0xd0]\n" + "ldr q4, [x28, #0xe0]\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + "ldr q5, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" - "add x11, x11, #0x100\n" ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" - "tbnz %x[flags], #31, 9f\n" + "tbnz %x[flags], #31, 10f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" - "9:" // Height 1: Multiply loop: unique 1: skip row sum - "prfm pldl1keep, [x26, #0x80]\n" - "sub x27, x27, #0x10\n" - "cmp x27, #0x20\n" - "bge 8b\n" - "10:" // Height 1: Multiply loop: Single iteration only - "sub x27, x27, #0x10\n" - "ldr q0, [x26, #0x0]\n" - "ldr q6, [x11, #0x0]\n" + "10:" // Height 1: Multiply loop: unique 2: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "11:" // Height 1: Multiply loop: Main loop skip + "cbz x24, 18f\n" + "cmp x24, #0x4\n" + "blt 14f\n" + "12:" // Height 1: 
Multiply loop: Odd block loop + "ldr s0, [x23], #0x4\n" + "tbnz %x[flags], #31, 13f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + "13:" // Height 1: Multiply loop: unique 3: skip row sum + "ldr q6, [x28, #0x0]\n" ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x11, #0x10]\n" - "ldr q8, [x11, #0x20]\n" + "ldr q7, [x28, #0x10]\n" + "sub x24, x24, #0x4\n" ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" - "ldr q9, [x11, #0x30]\n" + "ldr q8, [x28, #0x20]\n" + "cmp x24, #0x4\n" ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" - "ldr q10, [x11, #0x40]\n" - "ldr q4, [x11, #0x50]\n" + "ldr q9, [x28, #0x30]\n" + "add x28, x28, #0x40\n" ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" - "ldr q5, [x11, #0x60]\n" - "ldr q6, [x11, #0x70]\n" - ".inst 0x4fa0e150 // sdot v16.4s, v10.16b, v0.4b[1]\n" - ".inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1]\n" - "ldr q7, [x11, #0x80]\n" - "ldr q8, [x11, #0x90]\n" - ".inst 0x4fa0e0b2 // sdot v18.4s, v5.16b, v0.4b[1]\n" - "ldr q9, [x11, #0xa0]\n" - ".inst 0x4fa0e0d3 // sdot v19.4s, v6.16b, v0.4b[1]\n" - "ldr q10, [x11, #0xb0]\n" - "ldr q4, [x11, #0xc0]\n" - ".inst 0x4f80e8f0 // sdot v16.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f80e911 // sdot v17.4s, v8.16b, v0.4b[2]\n" - "ldr q5, [x11, #0xd0]\n" - "ldr q6, [x11, #0xe0]\n" - ".inst 0x4f80e932 // sdot v18.4s, v9.16b, v0.4b[2]\n" - "ldr q7, [x11, #0xf0]\n" - "add x26, x26, #0x10\n" - ".inst 0x4f80e953 // sdot v19.4s, v10.16b, v0.4b[2]\n" - ".inst 0x4fa0e890 // sdot v16.4s, v4.16b, v0.4b[3]\n" - "add x11, x11, #0x100\n" - ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n" - ".inst 0x4fa0e8d2 // sdot v18.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa0e8f3 // sdot v19.4s, v7.16b, v0.4b[3]\n" - "tbnz %x[flags], #31, 11f\n" + "bge 12b\n" + "cbz x24, 18f\n" + "14:" // Height 1: Multiply loop: Skip odd blocks + "tbz x24, #1, 15f\n" + "ldr h0, [x23], #0x2\n" + "tbz x24, #0, 16f\n" + "ld1 { v0.b }[2], [x23]\n" + "b 16f\n" + "15:" // Height 1: Multiply loop: 
Ragged operand read: partial_1_0 + "ldr b0, [x23, #0x0]\n" + "16:" // Height 1: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 17f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" - "11:" // Height 1: Multiply loop: unique 2: skip row sum - "prfm pldl1keep, [x26, #0x80]\n" - "12:" // Height 1: Multiply loop: Main loop skip - "cbz x27, 19f\n" - "cmp x27, #0x4\n" - "blt 15f\n" - "13:" // Height 1: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "tbnz %x[flags], #31, 14f\n" - ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" - "14:" // Height 1: Multiply loop: unique 3: skip row sum - "ldr q8, [x11, #0x0]\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q9, [x11, #0x10]\n" - "ldr q10, [x11, #0x20]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr q4, [x11, #0x30]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "sub x27, x27, #0x4\n" - "add x11, x11, #0x40\n" - ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n" - "cmp x27, #0x4\n" - "bge 13b\n" - "cbz x27, 19f\n" - "15:" // Height 1: Multiply loop: Skip odd blocks - "tbz x27, #1, 16f\n" - "ldr h0, [x26], #0x2\n" - "tbz x27, #0, 17f\n" - "ld1 { v0.b }[2], [x26]\n" - "b 17f\n" - "16:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x26, #0x0]\n" - "17:" // Height 1: Multiply loop: Ragged operand read: Done - "tbnz %x[flags], #31, 18f\n" - ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" - "18:" // Height 1: Multiply loop: unique 4: skip row sum - "ldr q5, [x11, #0x0]\n" - ".inst 0x4f80e0b0 // sdot v16.4s, v5.16b, v0.4b[0]\n" - "ldr q6, [x11, #0x10]\n" - "ldr q7, [x11, #0x20]\n" - ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n" - "ldr q8, [x11, #0x30]\n" - ".inst 0x4f80e0f2 // sdot v18.4s, v7.16b, v0.4b[0]\n" - "add x11, x11, #0x40\n" - ".inst 0x4f80e113 // sdot v19.4s, v8.16b, v0.4b[0]\n" - "19:" // Height 1: Multiply loop: No odd multiplies + "17:" // Height 1: Multiply loop: unique 4: skip row sum 
+ "ldr q10, [x28, #0x0]\n" + ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n" + "ldr q4, [x28, #0x10]\n" + "ldr q5, [x28, #0x20]\n" + ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n" + "ldr q6, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n" + "18:" // Height 1: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x28, x28, #0x1\n" - "cmp x28, x19\n" - "bne 5b\n" - "prfm pstl1keep, [x9, #0x0]\n" - "tbnz %x[flags], #31, 20f\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 4b\n" + "prfm pstl1keep, [x26, #0x0]\n" + "tbnz %x[flags], #31, 19f\n" "addp v11.4s, v11.4s, v11.4s\n" - "add x19, %x[qp], %[b_offset]\n" + "add x22, %x[qp], %[b_offset]\n" "addp v11.4s, v11.4s, v11.4s\n" - "ld1r { v1.4s }, [x19]\n" + "ld1r { v1.4s }, [x22]\n" "neg v1.4s, v1.4s\n" "mul v11.4s, v11.4s, v1.4s\n" - "20:" // Height 1: skip row sum fixup + "19:" // Height 1: skip row sum fixup "add v16.4s, v16.4s, v11.4s\n" + "ldr q0, [x27, #0x0]\n" "orr %x[flags], %x[flags], #0x80000000\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q0, [x10, #0x0]\n" + "ldr q1, [x27, #0x10]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add v18.4s, v18.4s, v11.4s\n" - "ldr q1, [x10, #0x10]\n" + "ldr q2, [x27, #0x20]\n" + "add x22, %x[qp], %[per_layer_mul]\n" "add v19.4s, v19.4s, v11.4s\n" - "ldr q2, [x10, #0x20]\n" - "ldr q3, [x10, #0x30]\n" + "ldr q3, [x27, #0x30]\n" + "add x27, x27, #0x40\n" "add v16.4s, v16.4s, v0.4s\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x20]\n" + "ld1r { v0.4s }, [x23]\n" + "ld1r { v4.4s }, [x22]\n" "add v17.4s, v17.4s, v1.4s\n" - "add x19, %x[qp], %[per_layer_mul]\n" "add v18.4s, v18.4s, v2.4s\n" - "ld1r { v4.4s }, [x19]\n" - "add x10, x10, #0x40\n" "add v19.4s, v19.4s, v3.4s\n" "sqrdmulh v16.4s, v16.4s, v4.4s\n" "sqrdmulh v17.4s, v17.4s, v4.4s\n" "sqrdmulh v18.4s, v18.4s, v4.4s\n" "sqrdmulh v19.4s, 
v19.4s, v4.4s\n" - "tbz %x[flags], #5, 21f\n" + "tbz %x[flags], #5, 20f\n" "and v4.16b, v16.16b, v0.16b\n" "sshr v4.4s, v4.4s, #0x1f\n" "and v5.16b, v17.16b, v0.16b\n" @@ -303,18 +294,18 @@ void a64_hybrid_s8qa_dot_4x16 ( "sqadd v17.4s, v17.4s, v5.4s\n" "sqadd v18.4s, v18.4s, v6.4s\n" "sqadd v19.4s, v19.4s, v7.4s\n" - "21:" // Height 1: no shift correction + "20:" // Height 1: no shift correction "srshl v16.4s, v16.4s, v0.4s\n" - "add x19, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x19]\n" + "add x22, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x22]\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x19, %x[qp], %[minval]\n" + "add x22, %x[qp], %[minval]\n" "srshl v18.4s, v18.4s, v0.4s\n" - "ld1r { v5.4s }, [x19]\n" - "add x19, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x22]\n" + "add x22, %x[qp], %[maxval]\n" "srshl v19.4s, v19.4s, v0.4s\n" - "ld1r { v6.4s }, [x19]\n" - "cmp x12, #0x10\n" + "ld1r { v6.4s }, [x22]\n" + "cmp x9, #0x10\n" "add v16.4s, v16.4s, v4.4s\n" "add v17.4s, v17.4s, v4.4s\n" "add v18.4s, v18.4s, v4.4s\n" @@ -330,80 +321,69 @@ void a64_hybrid_s8qa_dot_4x16 ( "smax v19.4s, v19.4s, v5.4s\n" "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v16.16b, v16.16b, v17.16b\n" - "bge 30f\n" - "tbz x12, #3, 25f\n" - "str d16, [x9], #0x8\n" - "tbz x12, #2, 23f\n" - "st1 { v16.s }[2], [x9], #0x4\n" - "tbz x12, #1, 22f\n" - "st1 { v16.h }[6], [x9], #0x2\n" - "tbz x12, #0, 29f\n" - "st1 { v16.b }[14], [x9]\n" - "b 29f\n" - "22:" // Height 1: Partial direct writeback: partial_1_12 - "tbz x12, #0, 29f\n" - "st1 { v16.b }[12], [x9]\n" - "b 29f\n" - "23:" // Height 1: Partial direct writeback: partial_2_8 - "tbz x12, #1, 24f\n" - "st1 { v16.h }[4], [x9], #0x2\n" - "tbz x12, #0, 29f\n" - "st1 { v16.b }[10], [x9]\n" - "b 29f\n" - "24:" // Height 1: Partial direct writeback: partial_1_8 - "tbz x12, #0, 29f\n" - "st1 { v16.b }[8], [x9]\n" - "b 29f\n" - "25:" // Height 1: Partial direct writeback: partial_4_0 - "tbz x12, #2, 27f\n" - "str s16, [x9], #0x4\n" - "tbz x12, #1, 26f\n" - "st1 { 
v16.h }[2], [x9], #0x2\n" - "tbz x12, #0, 29f\n" - "st1 { v16.b }[6], [x9]\n" - "b 29f\n" - "26:" // Height 1: Partial direct writeback: partial_1_4 - "tbz x12, #0, 29f\n" - "st1 { v16.b }[4], [x9]\n" - "b 29f\n" - "27:" // Height 1: Partial direct writeback: partial_2_0 - "tbz x12, #1, 28f\n" - "str h16, [x9], #0x2\n" - "tbz x12, #0, 29f\n" - "st1 { v16.b }[2], [x9]\n" - "b 29f\n" - "28:" // Height 1: Partial direct writeback: partial_1_0 - "str b16, [x9, #0x0]\n" - "29:" // Height 1: Partial direct writeback: Done - "b 31f\n" - "30:" // Height 1: Full writeback - "str q16, [x9, #0x0]\n" - "add x9, x9, #0x10\n" - "31:" // Height 1: Writeback done - "subs x12, x12, #0x10\n" - "bgt 3b\n" - "b 126f\n" - "32:" // Height 2 + "bge 29f\n" + "tbz x9, #3, 24f\n" + "str d16, [x26], #0x8\n" + "tbz x9, #2, 22f\n" + "st1 { v16.s }[2], [x26], #0x4\n" + "tbz x9, #1, 21f\n" + "st1 { v16.h }[6], [x26], #0x2\n" + "tbz x9, #0, 28f\n" + "st1 { v16.b }[14], [x26]\n" + "b 28f\n" + "21:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x9, #0, 28f\n" + "st1 { v16.b }[12], [x26]\n" + "b 28f\n" + "22:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x9, #1, 23f\n" + "st1 { v16.h }[4], [x26], #0x2\n" + "tbz x9, #0, 28f\n" + "st1 { v16.b }[10], [x26]\n" + "b 28f\n" + "23:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x9, #0, 28f\n" + "st1 { v16.b }[8], [x26]\n" + "b 28f\n" + "24:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x9, #2, 26f\n" + "str s16, [x26], #0x4\n" + "tbz x9, #1, 25f\n" + "st1 { v16.h }[2], [x26], #0x2\n" + "tbz x9, #0, 28f\n" + "st1 { v16.b }[6], [x26]\n" + "b 28f\n" + "25:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x9, #0, 28f\n" + "st1 { v16.b }[4], [x26]\n" + "b 28f\n" + "26:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x9, #1, 27f\n" + "str h16, [x26], #0x2\n" + "tbz x9, #0, 28f\n" + "st1 { v16.b }[2], [x26]\n" + "b 28f\n" + "27:" // Height 1: Partial direct writeback: partial_1_0 + "str b16, 
[x26, #0x0]\n" + "28:" // Height 1: Partial direct writeback: Done + "b 30f\n" + "29:" // Height 1: Full writeback + "str q16, [x26, #0x0]\n" + "add x26, x26, #0x10\n" + "30:" // Height 1: Writeback done + "subs x9, x9, #0x10\n" + "bgt 2b\n" + "b 122f\n" + "31:" // Height 2 "movi v11.4s, #0x0\n" - "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" - "mov x10, %x[col_bias]\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[col_bias]\n" "movi v12.4s, #0x0\n" - "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" "bic %x[flags], %x[flags], #0x80000000\n" - "movi v13.4s, #0x0\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "movi v14.4s, #0x0\n" "movi v15.16b, #0x1\n" - "tbz %x[flags], #2, 33f\n" - "ldr x9, [%x[output_ptr], #0x0]\n" - "ldr x25, [%x[output_ptr], #0x8]\n" - "add x9, x9, x19\n" - "add x25, x25, x19\n" - "b 34f\n" - "33:" // Height 2: setup direct output - "mov x9, %x[output_ptr]\n" - "add x25, x9, x19\n" - "34:" // Height 2: Column loop + "mov x26, %x[output_ptr]\n" + "32:" // Height 2: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -412,71 +392,135 @@ void a64_hybrid_s8qa_dot_4x16 ( "movi v21.4s, #0x0\n" "movi v22.4s, #0x0\n" "movi v23.4s, #0x0\n" - "35:" // Height 2: setup done - "mov x28, #0x0\n" - "36:" // Height 2: String loop + "33:" // Height 2: setup done + "mov x25, #0x0\n" + "34:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 37f\n" - "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 35f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x26, [x20, #0x0]\n" - "ldr x24, [x20, #0x8]\n" - "cbnz x28, 38f\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "cbnz x25, 36f\n" "ldr x19, [%x[args_ptr], 
%[offsetof_input_initial_col]]\n" - "add x26, x26, x19\n" - "add x24, x24, x19\n" - "b 38f\n" - "37:" // Height 2: setup direct input - "mov x26, %x[input_ptr]\n" - "add x24, x26, x19\n" - "38:" // Height 2: input setup done - "cmp x27, #0x10\n" - "blt 43f\n" - "cmp x27, #0x20\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "b 36f\n" + "35:" // Height 2: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19\n" + "36:" // Height 2: input setup done + "cmp x24, #0x10\n" "blt 41f\n" - "39:" // Height 2: Multiply loop: Main loop head - "ldr q0, [x26, #0x0]\n" - "ldr q1, [x24, #0x0]\n" - "ldr q4, [x11, #0x0]\n" + "ldr q0, [x23, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + "cmp x24, #0x20\n" + "ldr q4, [x28, #0x0]\n" + "blt 39f\n" + "37:" // Height 2: Multiply loop: Main loop head ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" - "ldr q5, [x11, #0x10]\n" + "ldr q5, [x28, #0x10]\n" + "add x23, x23, #0x10\n" ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" - "ldr q6, [x11, #0x20]\n" - "ldr q7, [x11, #0x30]\n" + "ldr q6, [x28, #0x20]\n" + "add x22, x22, #0x10\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" - "ldr q8, [x11, #0x40]\n" + "ldr q7, [x28, #0x30]\n" ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" - "ldr q9, [x11, #0x50]\n" + "ldr q8, [x28, #0x40]\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" - "ldr q10, [x11, #0x60]\n" + "ldr q9, [x28, #0x50]\n" ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" - "ldr q4, [x11, #0x70]\n" + "ldr q10, [x28, #0x60]\n" + "ldr q4, [x28, #0x70]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" - "ldr q5, [x11, #0x80]\n" ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" - "ldr q6, [x11, #0x90]\n" + "ldr q5, [x28, #0x80]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" - "ldr q7, [x11, #0xa0]\n" + "ldr q6, [x28, #0x90]\n" ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" - "ldr q8, [x11, #0xb0]\n" + "ldr q7, [x28, #0xa0]\n" ".inst 0x4fa0e131 // sdot v17.4s, 
v9.16b, v0.4b[1]\n" - "add x26, x26, #0x10\n" + "ldr q8, [x28, #0xb0]\n" + ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" + "ldr q9, [x28, #0xc0]\n" + ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" + ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" + "ldr q10, [x28, #0xd0]\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" + "ldr q4, [x28, #0xe0]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" + "ldr q5, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" + "tbnz %x[flags], #31, 38f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + "38:" // Height 2: Multiply loop: unique 5: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "sub x24, x24, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "cmp x24, #0x20\n" + "ldr q0, [x23, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + "ldr q4, [x28, #0x0]\n" + "bge 37b\n" + "39:" // Height 2: Multiply loop: Single iteration only + ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" + "ldr q5, [x28, #0x10]\n" + "sub x24, x24, #0x10\n" + ".inst 0x4f81e094 // sdot 
v20.4s, v4.16b, v1.4b[0]\n" + "ldr q6, [x28, #0x20]\n" + "add x23, x23, #0x10\n" + ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" + "ldr q7, [x28, #0x30]\n" + "add x22, x22, #0x10\n" + ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" + "ldr q8, [x28, #0x40]\n" + ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" + "ldr q9, [x28, #0x50]\n" + ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" + "ldr q10, [x28, #0x60]\n" + "ldr q4, [x28, #0x70]\n" + ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" + "ldr q5, [x28, #0x80]\n" + ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" + "ldr q6, [x28, #0x90]\n" + ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" + "ldr q7, [x28, #0xa0]\n" + ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" + "ldr q8, [x28, #0xb0]\n" ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" - "ldr q9, [x11, #0xc0]\n" + "ldr q9, [x28, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" - "add x24, x24, #0x10\n" ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" - "ldr q10, [x11, #0xd0]\n" + "ldr q10, [x28, #0xd0]\n" ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" - "ldr q4, [x11, #0xe0]\n" + "ldr q4, [x28, #0xe0]\n" ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" - "ldr q5, [x11, #0xf0]\n" + "ldr q5, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" - "add x11, x11, #0x100\n" ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" @@ -493,171 +537,109 @@ void a64_hybrid_s8qa_dot_4x16 ( "tbnz %x[flags], #31, 40f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" - "40:" // Height 2: Multiply loop: unique 5: skip row sum 
- "prfm pldl1keep, [x26, #0x80]\n" - "sub x27, x27, #0x10\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x27, #0x20\n" - "bge 39b\n" - "41:" // Height 2: Multiply loop: Single iteration only - "sub x27, x27, #0x10\n" - "ldr q0, [x26, #0x0]\n" - "ldr q1, [x24, #0x0]\n" - "ldr q6, [x11, #0x0]\n" + "40:" // Height 2: Multiply loop: unique 6: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "41:" // Height 2: Multiply loop: Main loop skip + "cbz x24, 48f\n" + "cmp x24, #0x4\n" + "blt 44f\n" + "42:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x23], #0x4\n" + "ldr s1, [x22], #0x4\n" + "tbnz %x[flags], #31, 43f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + "43:" // Height 2: Multiply loop: unique 7: skip row sum + "ldr q6, [x28, #0x0]\n" ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x11, #0x10]\n" + "ldr q7, [x28, #0x10]\n" + "sub x24, x24, #0x4\n" ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n" - "ldr q8, [x11, #0x20]\n" - "ldr q9, [x11, #0x30]\n" + "ldr q8, [x28, #0x20]\n" + "cmp x24, #0x4\n" ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" - "ldr q10, [x11, #0x40]\n" + "ldr q9, [x28, #0x30]\n" + "add x28, x28, #0x40\n" ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n" - "ldr q4, [x11, #0x50]\n" ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" - "ldr q5, [x11, #0x60]\n" ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n" - "ldr q6, [x11, #0x70]\n" ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" - "ldr q7, [x11, #0x80]\n" ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n" - "ldr q8, [x11, #0x90]\n" - ".inst 0x4fa0e150 // sdot v16.4s, v10.16b, v0.4b[1]\n" - "ldr q9, [x11, #0xa0]\n" - ".inst 0x4fa1e154 // sdot v20.4s, v10.16b, v1.4b[1]\n" - "ldr q10, [x11, #0xb0]\n" - ".inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1]\n" - "add x26, x26, #0x10\n" - ".inst 0x4fa1e095 // sdot v21.4s, v4.16b, v1.4b[1]\n" - "ldr 
q4, [x11, #0xc0]\n" - ".inst 0x4fa0e0b2 // sdot v18.4s, v5.16b, v0.4b[1]\n" - "add x24, x24, #0x10\n" - ".inst 0x4fa1e0b6 // sdot v22.4s, v5.16b, v1.4b[1]\n" - "ldr q5, [x11, #0xd0]\n" - ".inst 0x4fa0e0d3 // sdot v19.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0d7 // sdot v23.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x11, #0xe0]\n" - ".inst 0x4f80e8f0 // sdot v16.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8f4 // sdot v20.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x11, #0xf0]\n" - ".inst 0x4f80e911 // sdot v17.4s, v8.16b, v0.4b[2]\n" - "add x11, x11, #0x100\n" - ".inst 0x4f81e915 // sdot v21.4s, v8.16b, v1.4b[2]\n" - ".inst 0x4f80e932 // sdot v18.4s, v9.16b, v0.4b[2]\n" - ".inst 0x4f81e936 // sdot v22.4s, v9.16b, v1.4b[2]\n" - ".inst 0x4f80e953 // sdot v19.4s, v10.16b, v0.4b[2]\n" - ".inst 0x4f81e957 // sdot v23.4s, v10.16b, v1.4b[2]\n" - ".inst 0x4fa0e890 // sdot v16.4s, v4.16b, v0.4b[3]\n" - ".inst 0x4fa1e894 // sdot v20.4s, v4.16b, v1.4b[3]\n" - ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n" - ".inst 0x4fa1e8b5 // sdot v21.4s, v5.16b, v1.4b[3]\n" - ".inst 0x4fa0e8d2 // sdot v18.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8d6 // sdot v22.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa0e8f3 // sdot v19.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8f7 // sdot v23.4s, v7.16b, v1.4b[3]\n" - "tbnz %x[flags], #31, 42f\n" - ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" - ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" - "42:" // Height 2: Multiply loop: unique 6: skip row sum - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "43:" // Height 2: Multiply loop: Main loop skip - "cbz x27, 50f\n" - "cmp x27, #0x4\n" - "blt 46f\n" - "44:" // Height 2: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x24], #0x4\n" - "tbnz %x[flags], #31, 45f\n" + "bge 42b\n" + "cbz x24, 48f\n" + "44:" // Height 2: Multiply loop: Skip odd blocks + "tbz x24, #1, 45f\n" + "ldr h0, [x23], #0x2\n" + "ldr h1, [x22], #0x2\n" + "tbz x24, #0, 46f\n" + "ld1 { v0.b }[2], [x23]\n" + 
"ld1 { v1.b }[2], [x22]\n" + "b 46f\n" + "45:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x23, #0x0]\n" + "ldr b1, [x22, #0x0]\n" + "46:" // Height 2: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 47f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" - "45:" // Height 2: Multiply loop: unique 7: skip row sum - "ldr q8, [x11, #0x0]\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q9, [x11, #0x10]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q10, [x11, #0x20]\n" - "ldr q4, [x11, #0x30]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "sub x27, x27, #0x4\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "cmp x27, #0x4\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "add x11, x11, #0x40\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n" - ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n" - "bge 44b\n" - "cbz x27, 50f\n" - "46:" // Height 2: Multiply loop: Skip odd blocks - "tbz x27, #1, 47f\n" - "ldr h0, [x26], #0x2\n" - "ldr h1, [x24], #0x2\n" - "tbz x27, #0, 48f\n" - "ld1 { v0.b }[2], [x26]\n" - "ld1 { v1.b }[2], [x24]\n" - "b 48f\n" - "47:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x26, #0x0]\n" - "ldr b1, [x24, #0x0]\n" - "48:" // Height 2: Multiply loop: Ragged operand read: Done - "tbnz %x[flags], #31, 49f\n" - ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" - ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" - "49:" // Height 2: Multiply loop: unique 8: skip row sum - "ldr q5, [x11, #0x0]\n" - ".inst 0x4f80e0b0 // sdot v16.4s, v5.16b, v0.4b[0]\n" - "ldr q6, [x11, #0x10]\n" - ".inst 0x4f81e0b4 // sdot v20.4s, v5.16b, v1.4b[0]\n" - "ldr q7, [x11, #0x20]\n" - "ldr q8, [x11, #0x30]\n" - ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n" - "add x11, x11, #0x40\n" - ".inst 
0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f80e0f2 // sdot v18.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0f6 // sdot v22.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f80e113 // sdot v19.4s, v8.16b, v0.4b[0]\n" - ".inst 0x4f81e117 // sdot v23.4s, v8.16b, v1.4b[0]\n" - "50:" // Height 2: Multiply loop: No odd multiplies + "47:" // Height 2: Multiply loop: unique 8: skip row sum + "ldr q10, [x28, #0x0]\n" + ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n" + "ldr q4, [x28, #0x10]\n" + ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n" + "ldr q5, [x28, #0x20]\n" + "ldr q6, [x28, #0x30]\n" + ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n" + "add x28, x28, #0x40\n" + ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n" + ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n" + "48:" // Height 2: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x28, x28, #0x1\n" - "cmp x28, x19\n" - "bne 36b\n" - "prfm pstl1keep, [x9, #0x0]\n" - "prfm pstl1keep, [x25, #0x0]\n" - "tbnz %x[flags], #31, 51f\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 34b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x21, x26, x19\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbnz %x[flags], #31, 49f\n" "addp v11.4s, v11.4s, v11.4s\n" - "add x19, %x[qp], %[b_offset]\n" - "ld1r { v2.4s }, [x19]\n" + "add x22, %x[qp], %[b_offset]\n" + "ld1r { v2.4s }, [x22]\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "neg v2.4s, v2.4s\n" "mul v11.4s, v11.4s, v2.4s\n" "mul v12.4s, v12.4s, v2.4s\n" - "51:" // Height 2: skip row sum fixup + "49:" // Height 2: skip row sum fixup "add v16.4s, v16.4s, v11.4s\n" + "ldr q0, [x27, #0x0]\n" "orr %x[flags], %x[flags], #0x80000000\n" "add v17.4s, 
v17.4s, v11.4s\n" - "ldr q0, [x10, #0x0]\n" + "ldr q1, [x27, #0x10]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add v18.4s, v18.4s, v11.4s\n" - "ldr q1, [x10, #0x10]\n" + "ldr q2, [x27, #0x20]\n" + "add x22, %x[qp], %[per_layer_mul]\n" "add v19.4s, v19.4s, v11.4s\n" - "ldr q2, [x10, #0x20]\n" + "ldr q3, [x27, #0x30]\n" + "add x27, x27, #0x40\n" "add v20.4s, v20.4s, v12.4s\n" - "ldr q3, [x10, #0x30]\n" + "ld1r { v4.4s }, [x22]\n" "add v21.4s, v21.4s, v12.4s\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" "add v22.4s, v22.4s, v12.4s\n" - "add x19, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x19]\n" "add v23.4s, v23.4s, v12.4s\n" - "add x10, x10, #0x40\n" "add v16.4s, v16.4s, v0.4s\n" "add v17.4s, v17.4s, v1.4s\n" "add v18.4s, v18.4s, v2.4s\n" "add v19.4s, v19.4s, v3.4s\n" "add v20.4s, v20.4s, v0.4s\n" - "ld1r { v0.4s }, [x20]\n" + "ld1r { v0.4s }, [x23]\n" "add v21.4s, v21.4s, v1.4s\n" "add v22.4s, v22.4s, v2.4s\n" "add v23.4s, v23.4s, v3.4s\n" @@ -669,7 +651,7 @@ void a64_hybrid_s8qa_dot_4x16 ( "sqrdmulh v21.4s, v21.4s, v4.4s\n" "sqrdmulh v22.4s, v22.4s, v4.4s\n" "sqrdmulh v23.4s, v23.4s, v4.4s\n" - "tbz %x[flags], #5, 52f\n" + "tbz %x[flags], #5, 50f\n" "and v4.16b, v16.16b, v0.16b\n" "sshr v4.4s, v4.4s, #0x1f\n" "and v5.16b, v17.16b, v0.16b\n" @@ -694,18 +676,18 @@ void a64_hybrid_s8qa_dot_4x16 ( "sqadd v21.4s, v21.4s, v9.4s\n" "sqadd v22.4s, v22.4s, v10.4s\n" "sqadd v23.4s, v23.4s, v4.4s\n" - "52:" // Height 2: no shift correction + "50:" // Height 2: no shift correction "srshl v16.4s, v16.4s, v0.4s\n" - "add x19, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x19]\n" + "add x22, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x22]\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x19, %x[qp], %[minval]\n" + "add x22, %x[qp], %[minval]\n" "srshl v18.4s, v18.4s, v0.4s\n" - "ld1r { v5.4s }, [x19]\n" - "add x19, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x22]\n" + "add x22, %x[qp], %[maxval]\n" "srshl v19.4s, v19.4s, v0.4s\n" - "ld1r { v6.4s }, [x19]\n" - "cmp x12, 
#0x10\n" + "ld1r { v6.4s }, [x22]\n" + "cmp x9, #0x10\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" "srshl v22.4s, v22.4s, v0.4s\n" @@ -740,100 +722,86 @@ void a64_hybrid_s8qa_dot_4x16 ( "uzp1 v16.16b, v16.16b, v17.16b\n" "uzp1 v21.8h, v22.8h, v23.8h\n" "uzp1 v20.16b, v20.16b, v21.16b\n" - "bge 61f\n" - "tbz x12, #3, 56f\n" - "str d16, [x9], #0x8\n" - "str d20, [x25], #0x8\n" - "tbz x12, #2, 54f\n" - "st1 { v16.s }[2], [x9], #0x4\n" - "st1 { v20.s }[2], [x25], #0x4\n" - "tbz x12, #1, 53f\n" - "st1 { v16.h }[6], [x9], #0x2\n" - "st1 { v20.h }[6], [x25], #0x2\n" - "tbz x12, #0, 60f\n" - "st1 { v16.b }[14], [x9]\n" - "st1 { v20.b }[14], [x25]\n" - "b 60f\n" - "53:" // Height 2: Partial direct writeback: partial_1_12 - "tbz x12, #0, 60f\n" - "st1 { v16.b }[12], [x9]\n" - "st1 { v20.b }[12], [x25]\n" - "b 60f\n" - "54:" // Height 2: Partial direct writeback: partial_2_8 - "tbz x12, #1, 55f\n" - "st1 { v16.h }[4], [x9], #0x2\n" - "st1 { v20.h }[4], [x25], #0x2\n" - "tbz x12, #0, 60f\n" - "st1 { v16.b }[10], [x9]\n" - "st1 { v20.b }[10], [x25]\n" - "b 60f\n" - "55:" // Height 2: Partial direct writeback: partial_1_8 - "tbz x12, #0, 60f\n" - "st1 { v16.b }[8], [x9]\n" - "st1 { v20.b }[8], [x25]\n" - "b 60f\n" - "56:" // Height 2: Partial direct writeback: partial_4_0 - "tbz x12, #2, 58f\n" - "str s16, [x9], #0x4\n" - "str s20, [x25], #0x4\n" - "tbz x12, #1, 57f\n" - "st1 { v16.h }[2], [x9], #0x2\n" - "st1 { v20.h }[2], [x25], #0x2\n" - "tbz x12, #0, 60f\n" - "st1 { v16.b }[6], [x9]\n" - "st1 { v20.b }[6], [x25]\n" - "b 60f\n" - "57:" // Height 2: Partial direct writeback: partial_1_4 - "tbz x12, #0, 60f\n" - "st1 { v16.b }[4], [x9]\n" - "st1 { v20.b }[4], [x25]\n" + "bge 59f\n" + "tbz x9, #3, 54f\n" + "str d16, [x26], #0x8\n" + "str d20, [x21], #0x8\n" + "tbz x9, #2, 52f\n" + "st1 { v16.s }[2], [x26], #0x4\n" + "st1 { v20.s }[2], [x21], #0x4\n" + "tbz x9, #1, 51f\n" + "st1 { v16.h }[6], [x26], #0x2\n" + "st1 { v20.h }[6], [x21], #0x2\n" + "tbz x9, #0, 
58f\n" + "st1 { v16.b }[14], [x26]\n" + "st1 { v20.b }[14], [x21]\n" + "b 58f\n" + "51:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x9, #0, 58f\n" + "st1 { v16.b }[12], [x26]\n" + "st1 { v20.b }[12], [x21]\n" + "b 58f\n" + "52:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x9, #1, 53f\n" + "st1 { v16.h }[4], [x26], #0x2\n" + "st1 { v20.h }[4], [x21], #0x2\n" + "tbz x9, #0, 58f\n" + "st1 { v16.b }[10], [x26]\n" + "st1 { v20.b }[10], [x21]\n" + "b 58f\n" + "53:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x9, #0, 58f\n" + "st1 { v16.b }[8], [x26]\n" + "st1 { v20.b }[8], [x21]\n" + "b 58f\n" + "54:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x9, #2, 56f\n" + "str s16, [x26], #0x4\n" + "str s20, [x21], #0x4\n" + "tbz x9, #1, 55f\n" + "st1 { v16.h }[2], [x26], #0x2\n" + "st1 { v20.h }[2], [x21], #0x2\n" + "tbz x9, #0, 58f\n" + "st1 { v16.b }[6], [x26]\n" + "st1 { v20.b }[6], [x21]\n" + "b 58f\n" + "55:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x9, #0, 58f\n" + "st1 { v16.b }[4], [x26]\n" + "st1 { v20.b }[4], [x21]\n" + "b 58f\n" + "56:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x9, #1, 57f\n" + "str h16, [x26], #0x2\n" + "str h20, [x21], #0x2\n" + "tbz x9, #0, 58f\n" + "st1 { v16.b }[2], [x26]\n" + "st1 { v20.b }[2], [x21]\n" + "b 58f\n" + "57:" // Height 2: Partial direct writeback: partial_1_0 + "str b16, [x26, #0x0]\n" + "str b20, [x21, #0x0]\n" + "58:" // Height 2: Partial direct writeback: Done "b 60f\n" - "58:" // Height 2: Partial direct writeback: partial_2_0 - "tbz x12, #1, 59f\n" - "str h16, [x9], #0x2\n" - "str h20, [x25], #0x2\n" - "tbz x12, #0, 60f\n" - "st1 { v16.b }[2], [x9]\n" - "st1 { v20.b }[2], [x25]\n" - "b 60f\n" - "59:" // Height 2: Partial direct writeback: partial_1_0 - "str b16, [x9, #0x0]\n" - "str b20, [x25, #0x0]\n" - "60:" // Height 2: Partial direct writeback: Done - "b 62f\n" - "61:" // Height 2: Full writeback - "str q16, [x9, #0x0]\n" - "str q20, 
[x25, #0x0]\n" - "add x9, x9, #0x10\n" - "add x25, x25, #0x10\n" - "62:" // Height 2: Writeback done - "subs x12, x12, #0x10\n" - "bgt 34b\n" - "b 126f\n" - "63:" // Height 3 + "59:" // Height 2: Full writeback + "str q16, [x26, #0x0]\n" + "add x26, x26, #0x10\n" + "str q20, [x21, #0x0]\n" + "60:" // Height 2: Writeback done + "subs x9, x9, #0x10\n" + "bgt 32b\n" + "b 122f\n" + "61:" // Height 3 "movi v11.4s, #0x0\n" - "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" - "mov x10, %x[col_bias]\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[col_bias]\n" "movi v12.4s, #0x0\n" - "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" "bic %x[flags], %x[flags], #0x80000000\n" "movi v13.4s, #0x0\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "movi v14.4s, #0x0\n" + "mov x26, %x[output_ptr]\n" "movi v15.16b, #0x1\n" - "tbz %x[flags], #2, 64f\n" - "ldr x9, [%x[output_ptr], #0x0]\n" - "ldr x25, [%x[output_ptr], #0x8]\n" - "add x9, x9, x19\n" - "ldr x23, [%x[output_ptr], #0x10]\n" - "add x25, x25, x19\n" - "add x23, x23, x19\n" - "b 65f\n" - "64:" // Height 3: setup direct output - "mov x9, %x[output_ptr]\n" - "add x25, x9, x19\n" - "add x23, x25, x19\n" - "65:" // Height 3: Column loop + "62:" // Height 3: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -846,85 +814,169 @@ void a64_hybrid_s8qa_dot_4x16 ( "movi v25.4s, #0x0\n" "movi v26.4s, #0x0\n" "movi v27.4s, #0x0\n" - "66:" // Height 3: setup done - "mov x28, #0x0\n" - "67:" // Height 3: String loop + "63:" // Height 3: setup done + "mov x25, #0x0\n" + "64:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 68f\n" - "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 65f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" "add x20, x20, 
x19, LSL #3\n" - "ldr x26, [x20, #0x0]\n" - "ldr x24, [x20, #0x8]\n" - "ldr x22, [x20, #0x10]\n" - "cbnz x28, 69f\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "ldr x21, [x20, #0x10]\n" + "cbnz x25, 66f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x19\n" - "add x24, x24, x19\n" + "add x23, x23, x19\n" "add x22, x22, x19\n" - "b 69f\n" - "68:" // Height 3: setup direct input - "mov x26, %x[input_ptr]\n" - "add x24, x26, x19\n" - "add x22, x24, x19\n" - "69:" // Height 3: input setup done - "cmp x27, #0x10\n" - "blt 74f\n" - "cmp x27, #0x20\n" - "blt 72f\n" - "70:" // Height 3: Multiply loop: Main loop head - "ldr q0, [x26, #0x0]\n" - "ldr q1, [x24, #0x0]\n" - "ldr q2, [x22, #0x0]\n" - "ldr q4, [x11, #0x0]\n" + "add x21, x21, x19\n" + "b 66f\n" + "65:" // Height 3: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "66:" // Height 3: input setup done + "cmp x24, #0x10\n" + "blt 71f\n" + "ldr q0, [x23, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + "cmp x24, #0x20\n" + "ldr q2, [x21, #0x0]\n" + "ldr q4, [x28, #0x0]\n" + "blt 69f\n" + "67:" // Height 3: Multiply loop: Main loop head ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" - "ldr q5, [x11, #0x10]\n" + "ldr q5, [x28, #0x10]\n" + "add x23, x23, #0x10\n" ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" - "ldr q6, [x11, #0x20]\n" + "ldr q6, [x28, #0x20]\n" + "add x22, x22, #0x10\n" ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n" - "ldr q7, [x11, #0x30]\n" - "ldr q8, [x11, #0x40]\n" + "ldr q7, [x28, #0x30]\n" + "add x21, x21, #0x10\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" - "ldr q9, [x11, #0x50]\n" + "ldr q8, [x28, #0x40]\n" ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" - "ldr q10, [x11, #0x60]\n" + "ldr q9, [x28, #0x50]\n" ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n" - "ldr q4, [x11, #0x70]\n" + "ldr q10, [x28, #0x60]\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" - "ldr q5, 
[x11, #0x80]\n" + "ldr q4, [x28, #0x70]\n" ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" - "add x26, x26, #0x10\n" + "ldr q5, [x28, #0x80]\n" ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x11, #0x90]\n" + "ldr q6, [x28, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" - "add x24, x24, #0x10\n" ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x28, #0xa0]\n" + ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" + ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" + ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n" + "ldr q8, [x28, #0xb0]\n" + ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" + ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" + ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n" + "ldr q9, [x28, #0xc0]\n" + ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" + ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" + ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n" + "ldr q10, [x28, #0xd0]\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n" + "ldr q4, [x28, #0xe0]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n" + "ldr q5, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 
0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n" + "tbnz %x[flags], #31, 68f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + "68:" // Height 3: Multiply loop: unique 9: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "sub x24, x24, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "cmp x24, #0x20\n" + "prfm pldl1keep, [x21, #0x80]\n" + "ldr q0, [x23, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + "ldr q2, [x21, #0x0]\n" + "ldr q4, [x28, #0x0]\n" + "bge 67b\n" + "69:" // Height 3: Multiply loop: Single iteration only + ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" + "ldr q5, [x28, #0x10]\n" + "sub x24, x24, #0x10\n" + ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" + "ldr q6, [x28, #0x20]\n" + "add x23, x23, #0x10\n" + ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n" + "ldr q7, [x28, #0x30]\n" "add x22, x22, #0x10\n" + ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" + "ldr q8, [x28, #0x40]\n" + "add x21, x21, #0x10\n" + ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" + "ldr q9, [x28, #0x50]\n" + ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n" + "ldr q10, [x28, #0x60]\n" + ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" + "ldr q4, [x28, #0x70]\n" + ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" + "ldr q5, 
[x28, #0x80]\n" + ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x28, #0x90]\n" + ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x11, #0xa0]\n" + "ldr q7, [x28, #0xa0]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n" - "ldr q8, [x11, #0xb0]\n" + "ldr q8, [x28, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n" - "ldr q9, [x11, #0xc0]\n" + "ldr q9, [x28, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n" - "ldr q10, [x11, #0xd0]\n" + "ldr q10, [x28, #0xd0]\n" ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n" - "ldr q4, [x11, #0xe0]\n" + "ldr q4, [x28, #0xe0]\n" ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n" - "ldr q5, [x11, #0xf0]\n" + "ldr q5, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" - "add x11, x11, #0x100\n" ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" @@ -945,181 +997,100 @@ void a64_hybrid_s8qa_dot_4x16 ( ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n" - "tbnz %x[flags], #31, 71f\n" + "tbnz %x[flags], #31, 70f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, 
v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" - "71:" // Height 3: Multiply loop: unique 9: skip row sum - "prfm pldl1keep, [x26, #0x80]\n" - "sub x27, x27, #0x10\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x27, #0x20\n" + "70:" // Height 3: Multiply loop: unique 10: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" - "bge 70b\n" - "72:" // Height 3: Multiply loop: Single iteration only - "sub x27, x27, #0x10\n" - "ldr q0, [x26, #0x0]\n" - "ldr q1, [x24, #0x0]\n" - "ldr q2, [x22, #0x0]\n" - "ldr q6, [x11, #0x0]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "71:" // Height 3: Multiply loop: Main loop skip + "cbz x24, 78f\n" + "cmp x24, #0x4\n" + "blt 74f\n" + "72:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x23], #0x4\n" + "ldr s1, [x22], #0x4\n" + "ldr s2, [x21], #0x4\n" + "tbnz %x[flags], #31, 73f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + "73:" // Height 3: Multiply loop: unique 11: skip row sum + "ldr q6, [x28, #0x0]\n" ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x11, #0x10]\n" + "ldr q7, [x28, #0x10]\n" + "sub x24, x24, #0x4\n" ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n" - "ldr q8, [x11, #0x20]\n" + "ldr q8, [x28, #0x20]\n" + "cmp x24, #0x4\n" ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n" - "ldr q9, [x11, #0x30]\n" - "ldr q10, [x11, #0x40]\n" + "ldr q9, [x28, #0x30]\n" + "add x28, x28, #0x40\n" ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" - "ldr q4, [x11, #0x50]\n" ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n" - "ldr q5, [x11, #0x60]\n" ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n" - "ldr q6, [x11, #0x70]\n" ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" - "ldr q7, [x11, #0x80]\n" ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n" - "add x26, x26, #0x10\n" 
".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n" - "ldr q8, [x11, #0x90]\n" ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" - "add x24, x24, #0x10\n" ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n" - "add x22, x22, #0x10\n" ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n" - "ldr q9, [x11, #0xa0]\n" - ".inst 0x4fa0e150 // sdot v16.4s, v10.16b, v0.4b[1]\n" - ".inst 0x4fa1e154 // sdot v20.4s, v10.16b, v1.4b[1]\n" - ".inst 0x4fa2e158 // sdot v24.4s, v10.16b, v2.4b[1]\n" - "ldr q10, [x11, #0xb0]\n" - ".inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1]\n" - ".inst 0x4fa1e095 // sdot v21.4s, v4.16b, v1.4b[1]\n" - ".inst 0x4fa2e099 // sdot v25.4s, v4.16b, v2.4b[1]\n" - "ldr q4, [x11, #0xc0]\n" - ".inst 0x4fa0e0b2 // sdot v18.4s, v5.16b, v0.4b[1]\n" - ".inst 0x4fa1e0b6 // sdot v22.4s, v5.16b, v1.4b[1]\n" - ".inst 0x4fa2e0ba // sdot v26.4s, v5.16b, v2.4b[1]\n" - "ldr q5, [x11, #0xd0]\n" - ".inst 0x4fa0e0d3 // sdot v19.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0d7 // sdot v23.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0db // sdot v27.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x11, #0xe0]\n" - ".inst 0x4f80e8f0 // sdot v16.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8f4 // sdot v20.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f8 // sdot v24.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x11, #0xf0]\n" - ".inst 0x4f80e911 // sdot v17.4s, v8.16b, v0.4b[2]\n" - "add x11, x11, #0x100\n" - ".inst 0x4f81e915 // sdot v21.4s, v8.16b, v1.4b[2]\n" - ".inst 0x4f82e919 // sdot v25.4s, v8.16b, v2.4b[2]\n" - ".inst 0x4f80e932 // sdot v18.4s, v9.16b, v0.4b[2]\n" - ".inst 0x4f81e936 // sdot v22.4s, v9.16b, v1.4b[2]\n" - ".inst 0x4f82e93a // sdot v26.4s, v9.16b, v2.4b[2]\n" - ".inst 0x4f80e953 // sdot v19.4s, v10.16b, v0.4b[2]\n" - ".inst 0x4f81e957 // sdot v23.4s, v10.16b, v1.4b[2]\n" - ".inst 0x4f82e95b // sdot v27.4s, v10.16b, v2.4b[2]\n" - ".inst 0x4fa0e890 // sdot v16.4s, v4.16b, v0.4b[3]\n" - ".inst 0x4fa1e894 // sdot v20.4s, v4.16b, v1.4b[3]\n" - ".inst 0x4fa2e898 // sdot v24.4s, v4.16b, 
v2.4b[3]\n" - ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n" - ".inst 0x4fa1e8b5 // sdot v21.4s, v5.16b, v1.4b[3]\n" - ".inst 0x4fa2e8b9 // sdot v25.4s, v5.16b, v2.4b[3]\n" - ".inst 0x4fa0e8d2 // sdot v18.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8d6 // sdot v22.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8da // sdot v26.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa0e8f3 // sdot v19.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8f7 // sdot v23.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8fb // sdot v27.4s, v7.16b, v2.4b[3]\n" - "tbnz %x[flags], #31, 73f\n" - ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" - ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" - ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" - "73:" // Height 3: Multiply loop: unique 10: skip row sum - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "74:" // Height 3: Multiply loop: Main loop skip - "cbz x27, 81f\n" - "cmp x27, #0x4\n" - "blt 77f\n" - "75:" // Height 3: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x24], #0x4\n" - "ldr s2, [x22], #0x4\n" - "tbnz %x[flags], #31, 76f\n" - ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" - ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" - ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" - "76:" // Height 3: Multiply loop: unique 11: skip row sum - "ldr q8, [x11, #0x0]\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q9, [x11, #0x10]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q10, [x11, #0x20]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - "ldr q4, [x11, #0x30]\n" - "sub x27, x27, #0x4\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "cmp x27, #0x4\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "add x11, x11, #0x40\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - 
".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n" - ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n" - ".inst 0x4f82e09b // sdot v27.4s, v4.16b, v2.4b[0]\n" - "bge 75b\n" - "cbz x27, 81f\n" - "77:" // Height 3: Multiply loop: Skip odd blocks - "tbz x27, #1, 78f\n" - "ldr h0, [x26], #0x2\n" - "ldr h1, [x24], #0x2\n" - "ldr h2, [x22], #0x2\n" - "tbz x27, #0, 79f\n" - "ld1 { v0.b }[2], [x26]\n" - "ld1 { v1.b }[2], [x24]\n" - "ld1 { v2.b }[2], [x22]\n" - "b 79f\n" - "78:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x26, #0x0]\n" - "ldr b1, [x24, #0x0]\n" - "ldr b2, [x22, #0x0]\n" - "79:" // Height 3: Multiply loop: Ragged operand read: Done - "tbnz %x[flags], #31, 80f\n" + "bge 72b\n" + "cbz x24, 78f\n" + "74:" // Height 3: Multiply loop: Skip odd blocks + "tbz x24, #1, 75f\n" + "ldr h0, [x23], #0x2\n" + "ldr h1, [x22], #0x2\n" + "ldr h2, [x21], #0x2\n" + "tbz x24, #0, 76f\n" + "ld1 { v0.b }[2], [x23]\n" + "ld1 { v1.b }[2], [x22]\n" + "ld1 { v2.b }[2], [x21]\n" + "b 76f\n" + "75:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x23, #0x0]\n" + "ldr b1, [x22, #0x0]\n" + "ldr b2, [x21, #0x0]\n" + "76:" // Height 3: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 77f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" - "80:" // Height 3: Multiply loop: unique 12: skip row sum - "ldr q5, [x11, #0x0]\n" - ".inst 0x4f80e0b0 // sdot v16.4s, v5.16b, v0.4b[0]\n" - "ldr q6, [x11, #0x10]\n" - ".inst 0x4f81e0b4 // sdot v20.4s, v5.16b, v1.4b[0]\n" - "ldr q7, [x11, #0x20]\n" - ".inst 0x4f82e0b8 // sdot v24.4s, v5.16b, v2.4b[0]\n" - "ldr q8, [x11, #0x30]\n" - "add x11, x11, #0x40\n" - ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, 
v2.4b[0]\n" - ".inst 0x4f80e0f2 // sdot v18.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0f6 // sdot v22.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0fa // sdot v26.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f80e113 // sdot v19.4s, v8.16b, v0.4b[0]\n" - ".inst 0x4f81e117 // sdot v23.4s, v8.16b, v1.4b[0]\n" - ".inst 0x4f82e11b // sdot v27.4s, v8.16b, v2.4b[0]\n" - "81:" // Height 3: Multiply loop: No odd multiplies + "77:" // Height 3: Multiply loop: unique 12: skip row sum + "ldr q10, [x28, #0x0]\n" + ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n" + "ldr q4, [x28, #0x10]\n" + ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n" + "ldr q5, [x28, #0x20]\n" + ".inst 0x4f82e158 // sdot v24.4s, v10.16b, v2.4b[0]\n" + "ldr q6, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4f82e099 // sdot v25.4s, v4.16b, v2.4b[0]\n" + ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n" + ".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n" + ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0db // sdot v27.4s, v6.16b, v2.4b[0]\n" + "78:" // Height 3: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x28, x28, #0x1\n" - "cmp x28, x19\n" - "bne 67b\n" - "prfm pstl1keep, [x9, #0x0]\n" - "prfm pstl1keep, [x25, #0x0]\n" - "prfm pstl1keep, [x23, #0x0]\n" - "tbnz %x[flags], #31, 82f\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 64b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x21, x26, x19\n" + "prfm pstl1keep, [x21, #0x0]\n" + "add x20, x21, x19\n" + "prfm pstl1keep, [x20, #0x0]\n" + "tbnz %x[flags], #31, 79f\n" "addp v11.4s, v11.4s, v11.4s\n" - "add x19, %x[qp], %[b_offset]\n" - "ld1r { v3.4s }, [x19]\n" + "add x22, %x[qp], %[b_offset]\n" + 
"ld1r { v3.4s }, [x22]\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" "addp v11.4s, v11.4s, v11.4s\n" @@ -1129,24 +1100,24 @@ void a64_hybrid_s8qa_dot_4x16 ( "mul v11.4s, v11.4s, v3.4s\n" "mul v12.4s, v12.4s, v3.4s\n" "mul v13.4s, v13.4s, v3.4s\n" - "82:" // Height 3: skip row sum fixup + "79:" // Height 3: skip row sum fixup "add v16.4s, v16.4s, v11.4s\n" + "ldr q0, [x27, #0x0]\n" "orr %x[flags], %x[flags], #0x80000000\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q0, [x10, #0x0]\n" + "ldr q1, [x27, #0x10]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add v18.4s, v18.4s, v11.4s\n" - "ldr q1, [x10, #0x10]\n" + "ldr q2, [x27, #0x20]\n" + "add x22, %x[qp], %[per_layer_mul]\n" "add v19.4s, v19.4s, v11.4s\n" - "ldr q2, [x10, #0x20]\n" + "ldr q3, [x27, #0x30]\n" + "add x27, x27, #0x40\n" "add v20.4s, v20.4s, v12.4s\n" - "ldr q3, [x10, #0x30]\n" + "ld1r { v4.4s }, [x22]\n" "add v21.4s, v21.4s, v12.4s\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" "add v22.4s, v22.4s, v12.4s\n" - "add x19, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x19]\n" "add v23.4s, v23.4s, v12.4s\n" - "add x10, x10, #0x40\n" "add v24.4s, v24.4s, v13.4s\n" "add v25.4s, v25.4s, v13.4s\n" "add v26.4s, v26.4s, v13.4s\n" @@ -1160,7 +1131,7 @@ void a64_hybrid_s8qa_dot_4x16 ( "add v22.4s, v22.4s, v2.4s\n" "add v23.4s, v23.4s, v3.4s\n" "add v24.4s, v24.4s, v0.4s\n" - "ld1r { v0.4s }, [x20]\n" + "ld1r { v0.4s }, [x23]\n" "add v25.4s, v25.4s, v1.4s\n" "add v26.4s, v26.4s, v2.4s\n" "add v27.4s, v27.4s, v3.4s\n" @@ -1176,7 +1147,7 @@ void a64_hybrid_s8qa_dot_4x16 ( "sqrdmulh v25.4s, v25.4s, v4.4s\n" "sqrdmulh v26.4s, v26.4s, v4.4s\n" "sqrdmulh v27.4s, v27.4s, v4.4s\n" - "tbz %x[flags], #5, 83f\n" + "tbz %x[flags], #5, 80f\n" "and v4.16b, v16.16b, v0.16b\n" "sshr v4.4s, v4.4s, #0x1f\n" "and v5.16b, v17.16b, v0.16b\n" @@ -1213,18 +1184,18 @@ void a64_hybrid_s8qa_dot_4x16 ( "sshr v8.4s, v8.4s, #0x1f\n" "sqadd v26.4s, v26.4s, v7.4s\n" "sqadd v27.4s, v27.4s, v8.4s\n" - "83:" // Height 3: 
no shift correction + "80:" // Height 3: no shift correction "srshl v16.4s, v16.4s, v0.4s\n" - "add x19, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x19]\n" + "add x22, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x22]\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x19, %x[qp], %[minval]\n" + "add x22, %x[qp], %[minval]\n" "srshl v18.4s, v18.4s, v0.4s\n" - "ld1r { v5.4s }, [x19]\n" - "add x19, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x22]\n" + "add x22, %x[qp], %[maxval]\n" "srshl v19.4s, v19.4s, v0.4s\n" - "ld1r { v6.4s }, [x19]\n" - "cmp x12, #0x10\n" + "ld1r { v6.4s }, [x22]\n" + "cmp x9, #0x10\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" "srshl v22.4s, v22.4s, v0.4s\n" @@ -1278,122 +1249,106 @@ void a64_hybrid_s8qa_dot_4x16 ( "uzp1 v16.16b, v16.16b, v17.16b\n" "uzp1 v20.16b, v20.16b, v21.16b\n" "uzp1 v24.16b, v24.16b, v25.16b\n" - "bge 92f\n" - "tbz x12, #3, 87f\n" - "str d16, [x9], #0x8\n" - "str d20, [x25], #0x8\n" - "str d24, [x23], #0x8\n" - "tbz x12, #2, 85f\n" - "st1 { v16.s }[2], [x9], #0x4\n" - "st1 { v20.s }[2], [x25], #0x4\n" - "st1 { v24.s }[2], [x23], #0x4\n" - "tbz x12, #1, 84f\n" - "st1 { v16.h }[6], [x9], #0x2\n" - "st1 { v20.h }[6], [x25], #0x2\n" - "st1 { v24.h }[6], [x23], #0x2\n" - "tbz x12, #0, 91f\n" - "st1 { v16.b }[14], [x9]\n" - "st1 { v20.b }[14], [x25]\n" - "st1 { v24.b }[14], [x23]\n" - "b 91f\n" - "84:" // Height 3: Partial direct writeback: partial_1_12 - "tbz x12, #0, 91f\n" - "st1 { v16.b }[12], [x9]\n" - "st1 { v20.b }[12], [x25]\n" - "st1 { v24.b }[12], [x23]\n" - "b 91f\n" - "85:" // Height 3: Partial direct writeback: partial_2_8 - "tbz x12, #1, 86f\n" - "st1 { v16.h }[4], [x9], #0x2\n" - "st1 { v20.h }[4], [x25], #0x2\n" - "st1 { v24.h }[4], [x23], #0x2\n" - "tbz x12, #0, 91f\n" - "st1 { v16.b }[10], [x9]\n" - "st1 { v20.b }[10], [x25]\n" - "st1 { v24.b }[10], [x23]\n" - "b 91f\n" - "86:" // Height 3: Partial direct writeback: partial_1_8 - "tbz x12, #0, 91f\n" - "st1 { v16.b }[8], [x9]\n" - "st1 { v20.b }[8], 
[x25]\n" - "st1 { v24.b }[8], [x23]\n" - "b 91f\n" - "87:" // Height 3: Partial direct writeback: partial_4_0 - "tbz x12, #2, 89f\n" - "str s16, [x9], #0x4\n" - "str s20, [x25], #0x4\n" - "str s24, [x23], #0x4\n" - "tbz x12, #1, 88f\n" - "st1 { v16.h }[2], [x9], #0x2\n" - "st1 { v20.h }[2], [x25], #0x2\n" - "st1 { v24.h }[2], [x23], #0x2\n" - "tbz x12, #0, 91f\n" - "st1 { v16.b }[6], [x9]\n" - "st1 { v20.b }[6], [x25]\n" - "st1 { v24.b }[6], [x23]\n" - "b 91f\n" - "88:" // Height 3: Partial direct writeback: partial_1_4 - "tbz x12, #0, 91f\n" - "st1 { v16.b }[4], [x9]\n" - "st1 { v20.b }[4], [x25]\n" - "st1 { v24.b }[4], [x23]\n" - "b 91f\n" - "89:" // Height 3: Partial direct writeback: partial_2_0 - "tbz x12, #1, 90f\n" - "str h16, [x9], #0x2\n" - "str h20, [x25], #0x2\n" - "str h24, [x23], #0x2\n" - "tbz x12, #0, 91f\n" - "st1 { v16.b }[2], [x9]\n" - "st1 { v20.b }[2], [x25]\n" - "st1 { v24.b }[2], [x23]\n" - "b 91f\n" - "90:" // Height 3: Partial direct writeback: partial_1_0 - "str b16, [x9, #0x0]\n" - "str b20, [x25, #0x0]\n" - "str b24, [x23, #0x0]\n" - "91:" // Height 3: Partial direct writeback: Done - "b 93f\n" - "92:" // Height 3: Full writeback - "str q16, [x9, #0x0]\n" - "str q20, [x25, #0x0]\n" - "str q24, [x23, #0x0]\n" - "add x9, x9, #0x10\n" - "add x25, x25, #0x10\n" - "add x23, x23, #0x10\n" - "93:" // Height 3: Writeback done - "subs x12, x12, #0x10\n" - "bgt 65b\n" - "b 126f\n" - "94:" // Height 4 + "bge 89f\n" + "tbz x9, #3, 84f\n" + "str d16, [x26], #0x8\n" + "str d20, [x21], #0x8\n" + "str d24, [x20], #0x8\n" + "tbz x9, #2, 82f\n" + "st1 { v16.s }[2], [x26], #0x4\n" + "st1 { v20.s }[2], [x21], #0x4\n" + "st1 { v24.s }[2], [x20], #0x4\n" + "tbz x9, #1, 81f\n" + "st1 { v16.h }[6], [x26], #0x2\n" + "st1 { v20.h }[6], [x21], #0x2\n" + "st1 { v24.h }[6], [x20], #0x2\n" + "tbz x9, #0, 88f\n" + "st1 { v16.b }[14], [x26]\n" + "st1 { v20.b }[14], [x21]\n" + "st1 { v24.b }[14], [x20]\n" + "b 88f\n" + "81:" // Height 3: Partial direct writeback: 
partial_1_12 + "tbz x9, #0, 88f\n" + "st1 { v16.b }[12], [x26]\n" + "st1 { v20.b }[12], [x21]\n" + "st1 { v24.b }[12], [x20]\n" + "b 88f\n" + "82:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x9, #1, 83f\n" + "st1 { v16.h }[4], [x26], #0x2\n" + "st1 { v20.h }[4], [x21], #0x2\n" + "st1 { v24.h }[4], [x20], #0x2\n" + "tbz x9, #0, 88f\n" + "st1 { v16.b }[10], [x26]\n" + "st1 { v20.b }[10], [x21]\n" + "st1 { v24.b }[10], [x20]\n" + "b 88f\n" + "83:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x9, #0, 88f\n" + "st1 { v16.b }[8], [x26]\n" + "st1 { v20.b }[8], [x21]\n" + "st1 { v24.b }[8], [x20]\n" + "b 88f\n" + "84:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x9, #2, 86f\n" + "str s16, [x26], #0x4\n" + "str s20, [x21], #0x4\n" + "str s24, [x20], #0x4\n" + "tbz x9, #1, 85f\n" + "st1 { v16.h }[2], [x26], #0x2\n" + "st1 { v20.h }[2], [x21], #0x2\n" + "st1 { v24.h }[2], [x20], #0x2\n" + "tbz x9, #0, 88f\n" + "st1 { v16.b }[6], [x26]\n" + "st1 { v20.b }[6], [x21]\n" + "st1 { v24.b }[6], [x20]\n" + "b 88f\n" + "85:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x9, #0, 88f\n" + "st1 { v16.b }[4], [x26]\n" + "st1 { v20.b }[4], [x21]\n" + "st1 { v24.b }[4], [x20]\n" + "b 88f\n" + "86:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x9, #1, 87f\n" + "str h16, [x26], #0x2\n" + "str h20, [x21], #0x2\n" + "str h24, [x20], #0x2\n" + "tbz x9, #0, 88f\n" + "st1 { v16.b }[2], [x26]\n" + "st1 { v20.b }[2], [x21]\n" + "st1 { v24.b }[2], [x20]\n" + "b 88f\n" + "87:" // Height 3: Partial direct writeback: partial_1_0 + "str b16, [x26, #0x0]\n" + "str b20, [x21, #0x0]\n" + "str b24, [x20, #0x0]\n" + "88:" // Height 3: Partial direct writeback: Done + "b 90f\n" + "89:" // Height 3: Full writeback + "str q16, [x26, #0x0]\n" + "add x26, x26, #0x10\n" + "str q20, [x21, #0x0]\n" + "str q24, [x20, #0x0]\n" + "90:" // Height 3: Writeback done + "subs x9, x9, #0x10\n" + "bgt 62b\n" + "b 122f\n" + "91:" // Height 4 "movi v11.4s, 
#0x0\n" - "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" - "mov x10, %x[col_bias]\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[col_bias]\n" "movi v12.4s, #0x0\n" - "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" "bic %x[flags], %x[flags], #0x80000000\n" "movi v13.4s, #0x0\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x26, %x[output_ptr]\n" "movi v14.4s, #0x0\n" + "mov x19, #0x4\n" "movi v15.16b, #0x1\n" - "tbz %x[flags], #2, 95f\n" - "ldr x9, [%x[output_ptr], #0x0]\n" - "ldr x25, [%x[output_ptr], #0x8]\n" - "add x9, x9, x19\n" - "ldr x23, [%x[output_ptr], #0x10]\n" - "ldr x21, [%x[output_ptr], #0x18]\n" - "add x25, x25, x19\n" - "add %x[output_ptr], %x[output_ptr], #0x20\n" - "add x23, x23, x19\n" - "add x21, x21, x19\n" - "b 96f\n" - "95:" // Height 4: setup direct output - "mov x9, %x[output_ptr]\n" - "add x25, x9, x19\n" - "add x23, x25, x19\n" - "add x21, x23, x19\n" - "add %x[output_ptr], x21, x19\n" - "96:" // Height 4: Column loop + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "92:" // Height 4: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -1410,99 +1365,99 @@ void a64_hybrid_s8qa_dot_4x16 ( "movi v29.4s, #0x0\n" "movi v30.4s, #0x0\n" "movi v31.4s, #0x0\n" - "97:" // Height 4: setup done - "mov x28, #0x0\n" - "98:" // Height 4: String loop + "93:" // Height 4: setup done + "mov x25, #0x0\n" + "94:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 99f\n" - "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 95f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x26, [x20, #0x0]\n" - "ldr x24, [x20, #0x8]\n" - "ldr x22, [x20, #0x10]\n" + "ldr x23, [x20, 
#0x0]\n" + "ldr x22, [x20, #0x8]\n" + "ldr x21, [x20, #0x10]\n" "ldr x20, [x20, #0x18]\n" - "cbnz x28, 100f\n" + "cbnz x25, 96f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x19\n" - "add x24, x24, x19\n" + "add x23, x23, x19\n" "add x22, x22, x19\n" + "add x21, x21, x19\n" "add x20, x20, x19\n" - "b 100f\n" - "99:" // Height 4: setup direct input - "mov x26, %x[input_ptr]\n" - "add x24, x26, x19\n" - "add x22, x24, x19\n" - "add x20, x22, x19\n" - "100:" // Height 4: input setup done - "cmp x27, #0x10\n" - "blt 105f\n" - "cmp x27, #0x20\n" - "blt 103f\n" - "101:" // Height 4: Multiply loop: Main loop head - "ldr q0, [x26, #0x0]\n" - "ldr q1, [x24, #0x0]\n" - "ldr q2, [x22, #0x0]\n" + "b 96f\n" + "95:" // Height 4: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "add x20, x21, x19\n" + "96:" // Height 4: input setup done + "cmp x24, #0x10\n" + "blt 101f\n" + "ldr q0, [x23, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + "cmp x24, #0x20\n" + "ldr q2, [x21, #0x0]\n" "ldr q3, [x20, #0x0]\n" - "ldr q4, [x11, #0x0]\n" + "ldr q4, [x28, #0x0]\n" + "blt 99f\n" + "97:" // Height 4: Multiply loop: Main loop head ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" - "ldr q5, [x11, #0x10]\n" + "ldr q5, [x28, #0x10]\n" + "add x23, x23, #0x10\n" ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" - "ldr q6, [x11, #0x20]\n" + "ldr q6, [x28, #0x20]\n" + "add x22, x22, #0x10\n" ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n" - "ldr q7, [x11, #0x30]\n" + "ldr q7, [x28, #0x30]\n" + "add x21, x21, #0x10\n" ".inst 0x4f83e09c // sdot v28.4s, v4.16b, v3.4b[0]\n" - "ldr q8, [x11, #0x40]\n" - "ldr q9, [x11, #0x50]\n" + "ldr q8, [x28, #0x40]\n" + "add x20, x20, #0x10\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" - "ldr q10, [x11, #0x60]\n" + "ldr q9, [x28, #0x50]\n" ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" - "ldr q4, [x11, #0x70]\n" + "ldr q10, [x28, #0x60]\n" ".inst 0x4f82e0b9 // sdot 
v25.4s, v5.16b, v2.4b[0]\n" - "add x26, x26, #0x10\n" + "ldr q4, [x28, #0x70]\n" ".inst 0x4f83e0bd // sdot v29.4s, v5.16b, v3.4b[0]\n" - "ldr q5, [x11, #0x80]\n" + "ldr q5, [x28, #0x80]\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" - "add x24, x24, #0x10\n" ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" - "add x22, x22, #0x10\n" ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n" - "add x20, x20, #0x10\n" ".inst 0x4f83e0de // sdot v30.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x11, #0x90]\n" + "ldr q6, [x28, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0ff // sdot v31.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x11, #0xa0]\n" + "ldr q7, [x28, #0xa0]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n" ".inst 0x4fa3e11c // sdot v28.4s, v8.16b, v3.4b[1]\n" - "ldr q8, [x11, #0xb0]\n" + "ldr q8, [x28, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n" ".inst 0x4fa3e13d // sdot v29.4s, v9.16b, v3.4b[1]\n" - "ldr q9, [x11, #0xc0]\n" + "ldr q9, [x28, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n" ".inst 0x4fa3e15e // sdot v30.4s, v10.16b, v3.4b[1]\n" - "ldr q10, [x11, #0xd0]\n" + "ldr q10, [x28, #0xd0]\n" ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n" ".inst 0x4fa3e09f // sdot v31.4s, v4.16b, v3.4b[1]\n" - "ldr q4, [x11, #0xe0]\n" + "ldr q4, [x28, #0xe0]\n" ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, 
v1.4b[2]\n" ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n" ".inst 0x4f83e8bc // sdot v28.4s, v5.16b, v3.4b[2]\n" - "ldr q5, [x11, #0xf0]\n" + "ldr q5, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" - "add x11, x11, #0x100\n" ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8dd // sdot v29.4s, v6.16b, v3.4b[2]\n" @@ -1530,218 +1485,222 @@ void a64_hybrid_s8qa_dot_4x16 ( ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n" ".inst 0x4fa3e8bf // sdot v31.4s, v5.16b, v3.4b[3]\n" - "tbnz %x[flags], #31, 102f\n" + "tbnz %x[flags], #31, 98f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" - "102:" // Height 4: Multiply loop: unique 13: skip row sum - "prfm pldl1keep, [x26, #0x80]\n" - "sub x27, x27, #0x10\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x27, #0x20\n" + "98:" // Height 4: Multiply loop: unique 13: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "sub x24, x24, #0x10\n" "prfm pldl1keep, [x22, #0x80]\n" + "cmp x24, #0x20\n" + "prfm pldl1keep, [x21, #0x80]\n" "prfm pldl1keep, [x20, #0x80]\n" - "bge 101b\n" - "103:" // Height 4: Multiply loop: Single iteration only - "sub x27, x27, #0x10\n" - "ldr q0, [x26, #0x0]\n" - "ldr q1, [x24, #0x0]\n" - "ldr q2, [x22, #0x0]\n" + "ldr q0, [x23, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + "ldr q2, [x21, #0x0]\n" "ldr q3, [x20, #0x0]\n" - "ldr q6, [x11, #0x0]\n" + "ldr q4, [x28, #0x0]\n" + "bge 97b\n" + "99:" // Height 4: Multiply loop: Single iteration only + ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" + "ldr q5, [x28, #0x10]\n" + "sub x24, x24, #0x10\n" + ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" + "ldr q6, [x28, #0x20]\n" + "add x23, x23, #0x10\n" + ".inst 
0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n" + "ldr q7, [x28, #0x30]\n" + "add x22, x22, #0x10\n" + ".inst 0x4f83e09c // sdot v28.4s, v4.16b, v3.4b[0]\n" + "ldr q8, [x28, #0x40]\n" + "add x21, x21, #0x10\n" + ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" + "ldr q9, [x28, #0x50]\n" + "add x20, x20, #0x10\n" + ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" + "ldr q10, [x28, #0x60]\n" + ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n" + "ldr q4, [x28, #0x70]\n" + ".inst 0x4f83e0bd // sdot v29.4s, v5.16b, v3.4b[0]\n" + "ldr q5, [x28, #0x80]\n" + ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0de // sdot v30.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x28, #0x90]\n" + ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0ff // sdot v31.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x28, #0xa0]\n" + ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" + ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" + ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n" + ".inst 0x4fa3e11c // sdot v28.4s, v8.16b, v3.4b[1]\n" + "ldr q8, [x28, #0xb0]\n" + ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" + ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" + ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n" + ".inst 0x4fa3e13d // sdot v29.4s, v9.16b, v3.4b[1]\n" + "ldr q9, [x28, #0xc0]\n" + ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" + ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" + ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n" + ".inst 0x4fa3e15e // sdot v30.4s, v10.16b, v3.4b[1]\n" + "ldr q10, [x28, #0xd0]\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n" + 
".inst 0x4fa3e09f // sdot v31.4s, v4.16b, v3.4b[1]\n" + "ldr q4, [x28, #0xe0]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n" + ".inst 0x4f83e8bc // sdot v28.4s, v5.16b, v3.4b[2]\n" + "ldr q5, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8dd // sdot v29.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8fe // sdot v30.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 0x4f83e91f // sdot v31.4s, v8.16b, v3.4b[2]\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n" + ".inst 0x4fa3e93c // sdot v28.4s, v9.16b, v3.4b[3]\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n" + ".inst 0x4fa3e95d // sdot v29.4s, v10.16b, v3.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x4fa3e89e // sdot v30.4s, v4.16b, v3.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n" + ".inst 0x4fa3e8bf // sdot v31.4s, v5.16b, v3.4b[3]\n" + "tbnz %x[flags], #31, 100f\n" + ".inst 0x4e8f940b // sdot v11.4s, 
v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" + "100:" // Height 4: Multiply loop: unique 14: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "101:" // Height 4: Multiply loop: Main loop skip + "cbz x24, 108f\n" + "cmp x24, #0x4\n" + "blt 104f\n" + "102:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x23], #0x4\n" + "ldr s1, [x22], #0x4\n" + "ldr s2, [x21], #0x4\n" + "ldr s3, [x20], #0x4\n" + "tbnz %x[flags], #31, 103f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" + "103:" // Height 4: Multiply loop: unique 15: skip row sum + "ldr q6, [x28, #0x0]\n" ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x11, #0x10]\n" + "ldr q7, [x28, #0x10]\n" + "sub x24, x24, #0x4\n" ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n" - "ldr q8, [x11, #0x20]\n" + "ldr q8, [x28, #0x20]\n" + "cmp x24, #0x4\n" ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n" - "ldr q9, [x11, #0x30]\n" + "ldr q9, [x28, #0x30]\n" + "add x28, x28, #0x40\n" ".inst 0x4f83e0dc // sdot v28.4s, v6.16b, v3.4b[0]\n" - "ldr q10, [x11, #0x40]\n" - "ldr q4, [x11, #0x50]\n" ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" - "ldr q5, [x11, #0x60]\n" ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n" - "ldr q6, [x11, #0x70]\n" ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n" - "add x26, x26, #0x10\n" ".inst 0x4f83e0fd // sdot v29.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x11, #0x80]\n" ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" - "add x24, x24, #0x10\n" ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n" - "add x22, x22, #0x10\n" ".inst 0x4f82e11a // sdot 
v26.4s, v8.16b, v2.4b[0]\n" - "add x20, x20, #0x10\n" ".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n" - "ldr q8, [x11, #0x90]\n" ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n" ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n" ".inst 0x4f83e13f // sdot v31.4s, v9.16b, v3.4b[0]\n" - "ldr q9, [x11, #0xa0]\n" - ".inst 0x4fa0e150 // sdot v16.4s, v10.16b, v0.4b[1]\n" - ".inst 0x4fa1e154 // sdot v20.4s, v10.16b, v1.4b[1]\n" - ".inst 0x4fa2e158 // sdot v24.4s, v10.16b, v2.4b[1]\n" - ".inst 0x4fa3e15c // sdot v28.4s, v10.16b, v3.4b[1]\n" - "ldr q10, [x11, #0xb0]\n" - ".inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1]\n" - ".inst 0x4fa1e095 // sdot v21.4s, v4.16b, v1.4b[1]\n" - ".inst 0x4fa2e099 // sdot v25.4s, v4.16b, v2.4b[1]\n" - ".inst 0x4fa3e09d // sdot v29.4s, v4.16b, v3.4b[1]\n" - "ldr q4, [x11, #0xc0]\n" - ".inst 0x4fa0e0b2 // sdot v18.4s, v5.16b, v0.4b[1]\n" - ".inst 0x4fa1e0b6 // sdot v22.4s, v5.16b, v1.4b[1]\n" - ".inst 0x4fa2e0ba // sdot v26.4s, v5.16b, v2.4b[1]\n" - ".inst 0x4fa3e0be // sdot v30.4s, v5.16b, v3.4b[1]\n" - "ldr q5, [x11, #0xd0]\n" - ".inst 0x4fa0e0d3 // sdot v19.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0d7 // sdot v23.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0db // sdot v27.4s, v6.16b, v2.4b[1]\n" - ".inst 0x4fa3e0df // sdot v31.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x11, #0xe0]\n" - ".inst 0x4f80e8f0 // sdot v16.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8f4 // sdot v20.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f8 // sdot v24.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8fc // sdot v28.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x11, #0xf0]\n" - ".inst 0x4f80e911 // sdot v17.4s, v8.16b, v0.4b[2]\n" - "add x11, x11, #0x100\n" - ".inst 0x4f81e915 // sdot v21.4s, v8.16b, v1.4b[2]\n" - ".inst 0x4f82e919 // sdot v25.4s, v8.16b, v2.4b[2]\n" - ".inst 0x4f83e91d // sdot v29.4s, v8.16b, v3.4b[2]\n" - ".inst 0x4f80e932 // sdot v18.4s, v9.16b, v0.4b[2]\n" - ".inst 0x4f81e936 // sdot v22.4s, v9.16b, 
v1.4b[2]\n" - ".inst 0x4f82e93a // sdot v26.4s, v9.16b, v2.4b[2]\n" - ".inst 0x4f83e93e // sdot v30.4s, v9.16b, v3.4b[2]\n" - ".inst 0x4f80e953 // sdot v19.4s, v10.16b, v0.4b[2]\n" - ".inst 0x4f81e957 // sdot v23.4s, v10.16b, v1.4b[2]\n" - ".inst 0x4f82e95b // sdot v27.4s, v10.16b, v2.4b[2]\n" - ".inst 0x4f83e95f // sdot v31.4s, v10.16b, v3.4b[2]\n" - ".inst 0x4fa0e890 // sdot v16.4s, v4.16b, v0.4b[3]\n" - ".inst 0x4fa1e894 // sdot v20.4s, v4.16b, v1.4b[3]\n" - ".inst 0x4fa2e898 // sdot v24.4s, v4.16b, v2.4b[3]\n" - ".inst 0x4fa3e89c // sdot v28.4s, v4.16b, v3.4b[3]\n" - ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n" - ".inst 0x4fa1e8b5 // sdot v21.4s, v5.16b, v1.4b[3]\n" - ".inst 0x4fa2e8b9 // sdot v25.4s, v5.16b, v2.4b[3]\n" - ".inst 0x4fa3e8bd // sdot v29.4s, v5.16b, v3.4b[3]\n" - ".inst 0x4fa0e8d2 // sdot v18.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8d6 // sdot v22.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8da // sdot v26.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8de // sdot v30.4s, v6.16b, v3.4b[3]\n" - ".inst 0x4fa0e8f3 // sdot v19.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8f7 // sdot v23.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8fb // sdot v27.4s, v7.16b, v2.4b[3]\n" - ".inst 0x4fa3e8ff // sdot v31.4s, v7.16b, v3.4b[3]\n" - "tbnz %x[flags], #31, 104f\n" - ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" - ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" - ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" - ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" - "104:" // Height 4: Multiply loop: unique 14: skip row sum - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x20, #0x80]\n" - "105:" // Height 4: Multiply loop: Main loop skip - "cbz x27, 112f\n" - "cmp x27, #0x4\n" - "blt 108f\n" - "106:" // Height 4: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x24], #0x4\n" - "ldr s2, [x22], #0x4\n" - "ldr s3, [x20], #0x4\n" - "tbnz %x[flags], #31, 107f\n" - 
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" - ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" - ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" - ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" - "107:" // Height 4: Multiply loop: unique 15: skip row sum - "ldr q8, [x11, #0x0]\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q9, [x11, #0x10]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q10, [x11, #0x20]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - "ldr q4, [x11, #0x30]\n" - ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n" - "sub x27, x27, #0x4\n" - "add x11, x11, #0x40\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "cmp x27, #0x4\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n" - ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n" - ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n" - ".inst 0x4f82e09b // sdot v27.4s, v4.16b, v2.4b[0]\n" - ".inst 0x4f83e09f // sdot v31.4s, v4.16b, v3.4b[0]\n" - "bge 106b\n" - "cbz x27, 112f\n" - "108:" // Height 4: Multiply loop: Skip odd blocks - "tbz x27, #1, 109f\n" - "ldr h0, [x26], #0x2\n" - "ldr h1, [x24], #0x2\n" - "ldr h2, [x22], #0x2\n" + "bge 102b\n" + "cbz x24, 108f\n" + "104:" // Height 4: Multiply loop: Skip odd blocks + "tbz x24, #1, 105f\n" + "ldr h0, [x23], #0x2\n" + "ldr h1, [x22], #0x2\n" + "ldr h2, [x21], #0x2\n" "ldr h3, [x20], #0x2\n" - "tbz x27, #0, 110f\n" - "ld1 { v0.b }[2], [x26]\n" - "ld1 { v1.b }[2], [x24]\n" - "ld1 { v2.b }[2], [x22]\n" + "tbz x24, #0, 106f\n" + "ld1 { v0.b }[2], [x23]\n" + "ld1 { v1.b }[2], [x22]\n" + "ld1 { v2.b }[2], [x21]\n" "ld1 { v3.b 
}[2], [x20]\n" - "b 110f\n" - "109:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x26, #0x0]\n" - "ldr b1, [x24, #0x0]\n" - "ldr b2, [x22, #0x0]\n" + "b 106f\n" + "105:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x23, #0x0]\n" + "ldr b1, [x22, #0x0]\n" + "ldr b2, [x21, #0x0]\n" "ldr b3, [x20, #0x0]\n" - "110:" // Height 4: Multiply loop: Ragged operand read: Done - "tbnz %x[flags], #31, 111f\n" + "106:" // Height 4: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 107f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" - "111:" // Height 4: Multiply loop: unique 16: skip row sum - "ldr q5, [x11, #0x0]\n" - ".inst 0x4f80e0b0 // sdot v16.4s, v5.16b, v0.4b[0]\n" - "ldr q6, [x11, #0x10]\n" - ".inst 0x4f81e0b4 // sdot v20.4s, v5.16b, v1.4b[0]\n" - "ldr q7, [x11, #0x20]\n" - ".inst 0x4f82e0b8 // sdot v24.4s, v5.16b, v2.4b[0]\n" - "ldr q8, [x11, #0x30]\n" - ".inst 0x4f83e0bc // sdot v28.4s, v5.16b, v3.4b[0]\n" - "add x11, x11, #0x40\n" - ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0dd // sdot v29.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f80e0f2 // sdot v18.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0f6 // sdot v22.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0fa // sdot v26.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0fe // sdot v30.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f80e113 // sdot v19.4s, v8.16b, v0.4b[0]\n" - ".inst 0x4f81e117 // sdot v23.4s, v8.16b, v1.4b[0]\n" - ".inst 0x4f82e11b // sdot v27.4s, v8.16b, v2.4b[0]\n" - ".inst 0x4f83e11f // sdot v31.4s, v8.16b, v3.4b[0]\n" - "112:" // Height 4: Multiply loop: No odd multiplies + "107:" // Height 4: Multiply loop: unique 16: skip row sum + "ldr q10, [x28, #0x0]\n" + 
".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n" + "ldr q4, [x28, #0x10]\n" + ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n" + "ldr q5, [x28, #0x20]\n" + ".inst 0x4f82e158 // sdot v24.4s, v10.16b, v2.4b[0]\n" + "ldr q6, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + ".inst 0x4f83e15c // sdot v28.4s, v10.16b, v3.4b[0]\n" + ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4f82e099 // sdot v25.4s, v4.16b, v2.4b[0]\n" + ".inst 0x4f83e09d // sdot v29.4s, v4.16b, v3.4b[0]\n" + ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n" + ".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n" + ".inst 0x4f83e0be // sdot v30.4s, v5.16b, v3.4b[0]\n" + ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0db // sdot v27.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0df // sdot v31.4s, v6.16b, v3.4b[0]\n" + "108:" // Height 4: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x28, x28, #0x1\n" - "cmp x28, x19\n" - "bne 98b\n" - "prfm pstl1keep, [x9, #0x0]\n" - "prfm pstl1keep, [x25, #0x0]\n" - "prfm pstl1keep, [x23, #0x0]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 94b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x21, x26, x19\n" "prfm pstl1keep, [x21, #0x0]\n" - "tbnz %x[flags], #31, 113f\n" + "add x20, x21, x19\n" + "prfm pstl1keep, [x20, #0x0]\n" + "add x19, x20, x19\n" + "prfm pstl1keep, [x19, #0x0]\n" + "tbnz %x[flags], #31, 109f\n" "addp v11.4s, v11.4s, v11.4s\n" - "add x19, %x[qp], %[b_offset]\n" - "ld1r { v4.4s }, [x19]\n" + "add x22, %x[qp], %[b_offset]\n" + "ld1r { v4.4s }, [x22]\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" "addp v14.4s, v14.4s, v14.4s\n" @@ -1754,24 +1713,24 @@ void a64_hybrid_s8qa_dot_4x16 ( "mul v12.4s, v12.4s, v4.4s\n" 
"mul v13.4s, v13.4s, v4.4s\n" "mul v14.4s, v14.4s, v4.4s\n" - "113:" // Height 4: skip row sum fixup + "109:" // Height 4: skip row sum fixup "add v16.4s, v16.4s, v11.4s\n" + "ldr q0, [x27, #0x0]\n" "orr %x[flags], %x[flags], #0x80000000\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q0, [x10, #0x0]\n" + "ldr q1, [x27, #0x10]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add v18.4s, v18.4s, v11.4s\n" - "ldr q1, [x10, #0x10]\n" + "ldr q2, [x27, #0x20]\n" + "add x22, %x[qp], %[per_layer_mul]\n" "add v19.4s, v19.4s, v11.4s\n" - "ldr q2, [x10, #0x20]\n" + "ldr q3, [x27, #0x30]\n" + "add x27, x27, #0x40\n" "add v20.4s, v20.4s, v12.4s\n" - "ldr q3, [x10, #0x30]\n" + "ld1r { v4.4s }, [x22]\n" "add v21.4s, v21.4s, v12.4s\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" "add v22.4s, v22.4s, v12.4s\n" - "add x19, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x19]\n" "add v23.4s, v23.4s, v12.4s\n" - "add x10, x10, #0x40\n" "add v24.4s, v24.4s, v13.4s\n" "add v25.4s, v25.4s, v13.4s\n" "add v26.4s, v26.4s, v13.4s\n" @@ -1793,7 +1752,7 @@ void a64_hybrid_s8qa_dot_4x16 ( "add v26.4s, v26.4s, v2.4s\n" "add v27.4s, v27.4s, v3.4s\n" "add v28.4s, v28.4s, v0.4s\n" - "ld1r { v0.4s }, [x20]\n" + "ld1r { v0.4s }, [x23]\n" "add v29.4s, v29.4s, v1.4s\n" "add v30.4s, v30.4s, v2.4s\n" "add v31.4s, v31.4s, v3.4s\n" @@ -1813,7 +1772,7 @@ void a64_hybrid_s8qa_dot_4x16 ( "sqrdmulh v29.4s, v29.4s, v4.4s\n" "sqrdmulh v30.4s, v30.4s, v4.4s\n" "sqrdmulh v31.4s, v31.4s, v4.4s\n" - "tbz %x[flags], #5, 114f\n" + "tbz %x[flags], #5, 110f\n" "and v4.16b, v16.16b, v0.16b\n" "sshr v4.4s, v4.4s, #0x1f\n" "and v5.16b, v17.16b, v0.16b\n" @@ -1862,18 +1821,18 @@ void a64_hybrid_s8qa_dot_4x16 ( "sqadd v29.4s, v29.4s, v10.4s\n" "sqadd v30.4s, v30.4s, v4.4s\n" "sqadd v31.4s, v31.4s, v5.4s\n" - "114:" // Height 4: no shift correction + "110:" // Height 4: no shift correction "srshl v16.4s, v16.4s, v0.4s\n" - "add x19, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x19]\n" + "add x22, %x[qp], %[c_offset]\n" + "ld1r 
{ v4.4s }, [x22]\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x19, %x[qp], %[minval]\n" + "add x22, %x[qp], %[minval]\n" "srshl v18.4s, v18.4s, v0.4s\n" - "ld1r { v5.4s }, [x19]\n" - "add x19, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x22]\n" + "add x22, %x[qp], %[maxval]\n" "srshl v19.4s, v19.4s, v0.4s\n" - "ld1r { v6.4s }, [x19]\n" - "cmp x12, #0x10\n" + "ld1r { v6.4s }, [x22]\n" + "cmp x9, #0x10\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" "srshl v22.4s, v22.4s, v0.4s\n" @@ -1946,125 +1905,122 @@ void a64_hybrid_s8qa_dot_4x16 ( "uzp1 v20.16b, v20.16b, v21.16b\n" "uzp1 v24.16b, v24.16b, v25.16b\n" "uzp1 v28.16b, v28.16b, v29.16b\n" - "bge 123f\n" - "tbz x12, #3, 118f\n" - "str d16, [x9], #0x8\n" - "str d20, [x25], #0x8\n" - "str d24, [x23], #0x8\n" - "str d28, [x21], #0x8\n" - "tbz x12, #2, 116f\n" - "st1 { v16.s }[2], [x9], #0x4\n" - "st1 { v20.s }[2], [x25], #0x4\n" - "st1 { v24.s }[2], [x23], #0x4\n" - "st1 { v28.s }[2], [x21], #0x4\n" - "tbz x12, #1, 115f\n" - "st1 { v16.h }[6], [x9], #0x2\n" - "st1 { v20.h }[6], [x25], #0x2\n" - "st1 { v24.h }[6], [x23], #0x2\n" - "st1 { v28.h }[6], [x21], #0x2\n" - "tbz x12, #0, 122f\n" - "st1 { v16.b }[14], [x9]\n" - "st1 { v20.b }[14], [x25]\n" - "st1 { v24.b }[14], [x23]\n" - "st1 { v28.b }[14], [x21]\n" - "b 122f\n" - "115:" // Height 4: Partial direct writeback: partial_1_12 - "tbz x12, #0, 122f\n" - "st1 { v16.b }[12], [x9]\n" - "st1 { v20.b }[12], [x25]\n" - "st1 { v24.b }[12], [x23]\n" - "st1 { v28.b }[12], [x21]\n" - "b 122f\n" - "116:" // Height 4: Partial direct writeback: partial_2_8 - "tbz x12, #1, 117f\n" - "st1 { v16.h }[4], [x9], #0x2\n" - "st1 { v20.h }[4], [x25], #0x2\n" - "st1 { v24.h }[4], [x23], #0x2\n" - "st1 { v28.h }[4], [x21], #0x2\n" - "tbz x12, #0, 122f\n" - "st1 { v16.b }[10], [x9]\n" - "st1 { v20.b }[10], [x25]\n" - "st1 { v24.b }[10], [x23]\n" - "st1 { v28.b }[10], [x21]\n" - "b 122f\n" - "117:" // Height 4: Partial direct writeback: partial_1_8 - "tbz x12, #0, 122f\n" - "st1 
{ v16.b }[8], [x9]\n" - "st1 { v20.b }[8], [x25]\n" - "st1 { v24.b }[8], [x23]\n" - "st1 { v28.b }[8], [x21]\n" - "b 122f\n" - "118:" // Height 4: Partial direct writeback: partial_4_0 - "tbz x12, #2, 120f\n" - "str s16, [x9], #0x4\n" - "str s20, [x25], #0x4\n" - "str s24, [x23], #0x4\n" - "str s28, [x21], #0x4\n" - "tbz x12, #1, 119f\n" - "st1 { v16.h }[2], [x9], #0x2\n" - "st1 { v20.h }[2], [x25], #0x2\n" - "st1 { v24.h }[2], [x23], #0x2\n" - "st1 { v28.h }[2], [x21], #0x2\n" - "tbz x12, #0, 122f\n" - "st1 { v16.b }[6], [x9]\n" - "st1 { v20.b }[6], [x25]\n" - "st1 { v24.b }[6], [x23]\n" - "st1 { v28.b }[6], [x21]\n" - "b 122f\n" - "119:" // Height 4: Partial direct writeback: partial_1_4 - "tbz x12, #0, 122f\n" - "st1 { v16.b }[4], [x9]\n" - "st1 { v20.b }[4], [x25]\n" - "st1 { v24.b }[4], [x23]\n" - "st1 { v28.b }[4], [x21]\n" - "b 122f\n" - "120:" // Height 4: Partial direct writeback: partial_2_0 - "tbz x12, #1, 121f\n" - "str h16, [x9], #0x2\n" - "str h20, [x25], #0x2\n" - "str h24, [x23], #0x2\n" - "str h28, [x21], #0x2\n" - "tbz x12, #0, 122f\n" - "st1 { v16.b }[2], [x9]\n" - "st1 { v20.b }[2], [x25]\n" - "st1 { v24.b }[2], [x23]\n" - "st1 { v28.b }[2], [x21]\n" - "b 122f\n" - "121:" // Height 4: Partial direct writeback: partial_1_0 - "str b16, [x9, #0x0]\n" - "str b20, [x25, #0x0]\n" - "str b24, [x23, #0x0]\n" - "str b28, [x21, #0x0]\n" - "122:" // Height 4: Partial direct writeback: Done - "b 124f\n" - "123:" // Height 4: Full writeback - "str q16, [x9, #0x0]\n" - "str q20, [x25, #0x0]\n" - "str q24, [x23, #0x0]\n" - "str q28, [x21, #0x0]\n" - "add x9, x9, #0x10\n" - "add x25, x25, #0x10\n" - "add x23, x23, #0x10\n" - "add x21, x21, #0x10\n" - "124:" // Height 4: Writeback done - "subs x12, x12, #0x10\n" - "bgt 96b\n" + "bge 119f\n" + "tbz x9, #3, 114f\n" + "str d16, [x26], #0x8\n" + "str d20, [x21], #0x8\n" + "str d24, [x20], #0x8\n" + "str d28, [x19], #0x8\n" + "tbz x9, #2, 112f\n" + "st1 { v16.s }[2], [x26], #0x4\n" + "st1 { v20.s }[2], [x21], #0x4\n" 
+ "st1 { v24.s }[2], [x20], #0x4\n" + "st1 { v28.s }[2], [x19], #0x4\n" + "tbz x9, #1, 111f\n" + "st1 { v16.h }[6], [x26], #0x2\n" + "st1 { v20.h }[6], [x21], #0x2\n" + "st1 { v24.h }[6], [x20], #0x2\n" + "st1 { v28.h }[6], [x19], #0x2\n" + "tbz x9, #0, 118f\n" + "st1 { v16.b }[14], [x26]\n" + "st1 { v20.b }[14], [x21]\n" + "st1 { v24.b }[14], [x20]\n" + "st1 { v28.b }[14], [x19]\n" + "b 118f\n" + "111:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x9, #0, 118f\n" + "st1 { v16.b }[12], [x26]\n" + "st1 { v20.b }[12], [x21]\n" + "st1 { v24.b }[12], [x20]\n" + "st1 { v28.b }[12], [x19]\n" + "b 118f\n" + "112:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x9, #1, 113f\n" + "st1 { v16.h }[4], [x26], #0x2\n" + "st1 { v20.h }[4], [x21], #0x2\n" + "st1 { v24.h }[4], [x20], #0x2\n" + "st1 { v28.h }[4], [x19], #0x2\n" + "tbz x9, #0, 118f\n" + "st1 { v16.b }[10], [x26]\n" + "st1 { v20.b }[10], [x21]\n" + "st1 { v24.b }[10], [x20]\n" + "st1 { v28.b }[10], [x19]\n" + "b 118f\n" + "113:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x9, #0, 118f\n" + "st1 { v16.b }[8], [x26]\n" + "st1 { v20.b }[8], [x21]\n" + "st1 { v24.b }[8], [x20]\n" + "st1 { v28.b }[8], [x19]\n" + "b 118f\n" + "114:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x9, #2, 116f\n" + "str s16, [x26], #0x4\n" + "str s20, [x21], #0x4\n" + "str s24, [x20], #0x4\n" + "str s28, [x19], #0x4\n" + "tbz x9, #1, 115f\n" + "st1 { v16.h }[2], [x26], #0x2\n" + "st1 { v20.h }[2], [x21], #0x2\n" + "st1 { v24.h }[2], [x20], #0x2\n" + "st1 { v28.h }[2], [x19], #0x2\n" + "tbz x9, #0, 118f\n" + "st1 { v16.b }[6], [x26]\n" + "st1 { v20.b }[6], [x21]\n" + "st1 { v24.b }[6], [x20]\n" + "st1 { v28.b }[6], [x19]\n" + "b 118f\n" + "115:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x9, #0, 118f\n" + "st1 { v16.b }[4], [x26]\n" + "st1 { v20.b }[4], [x21]\n" + "st1 { v24.b }[4], [x20]\n" + "st1 { v28.b }[4], [x19]\n" + "b 118f\n" + "116:" // Height 4: Partial direct 
writeback: partial_2_0 + "tbz x9, #1, 117f\n" + "str h16, [x26], #0x2\n" + "str h20, [x21], #0x2\n" + "str h24, [x20], #0x2\n" + "str h28, [x19], #0x2\n" + "tbz x9, #0, 118f\n" + "st1 { v16.b }[2], [x26]\n" + "st1 { v20.b }[2], [x21]\n" + "st1 { v24.b }[2], [x20]\n" + "st1 { v28.b }[2], [x19]\n" + "b 118f\n" + "117:" // Height 4: Partial direct writeback: partial_1_0 + "str b16, [x26, #0x0]\n" + "str b20, [x21, #0x0]\n" + "str b24, [x20, #0x0]\n" + "str b28, [x19, #0x0]\n" + "118:" // Height 4: Partial direct writeback: Done + "b 120f\n" + "119:" // Height 4: Full writeback + "str q16, [x26, #0x0]\n" + "add x26, x26, #0x10\n" + "str q20, [x21, #0x0]\n" + "str q24, [x20, #0x0]\n" + "str q28, [x19, #0x0]\n" + "120:" // Height 4: Writeback done + "subs x9, x9, #0x10\n" + "bgt 92b\n" "subs %x[M], %x[M], #0x4\n" - "beq 126f\n" + "beq 122f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 125f\n" + "tbz %x[flags], #3, 121f\n" "add x20, x20, #0x4\n" "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "125:" // Update direct input + "121:" // Update direct input "mov x19, #0x4\n" "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" "b 1b\n" - "126:" // Exit + "122:" // Exit : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, 
string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp index 4ddc743f01..eb5bdfe55c 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp @@ -37,9 +37,9 @@ namespace arm_gemm { - // Actual kernel implementations void a64_hybrid_s8qs_dot_6x16( ARGLIST ); +void a64_hybrid_s8qs_dot_6x16_a55( ARGLIST ); class cls_a64_hybrid_s8qs_dot_6x16 { @@ -72,10 +72,11 @@ public: StdTransformsFixed transforms = {}; - static PerformanceParameters get_performance_parameters(const CPUInfo *ci) { + static PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { switch (ci->get_cpu_model()) { case CPUModel::A55r1: - return { 7.5301 }; + return { 8.28 }; default: return { 27.5482 }; } @@ -83,9 +84,15 @@ public: // Default to the generic kernel kern_type kernel=a64_hybrid_s8qs_dot_6x16; - - cls_a64_hybrid_s8qs_dot_6x16(const CPUInfo *) + cls_a64_hybrid_s8qs_dot_6x16(const CPUInfo *ci) { + switch(ci->get_cpu_model()) { + default: + break; + case CPUModel::A55r1: + 
kernel=a64_hybrid_s8qs_dot_6x16_a55; + break; + } } }; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp new file mode 100644 index 0000000000..6da3f7a9e3 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp @@ -0,0 +1,3772 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include <cassert> +#include <cstdint> + +namespace arm_gemm { + +void a64_hybrid_s8qs_dot_6x16_a55 ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg A_arg, + size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg output_arg, + const Requantize32 *qp, const int32_t *col_bias, unsigned int col_base +) +{ + struct KernelArgs { + const int32_t *multiplier_ptr = {}; + const int32_t *shift_ptr = {}; + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const int8_t *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + if (qp->per_channel_requant) { + flags |= 0x10; + ka.multiplier_ptr=qp->per_channel_muls + col_base; + ka.shift_ptr=qp->per_channel_right_shifts + col_base; + } + if (qp->c_offset > qp->minval) { + flags |= 0x20; + } + __asm__ __volatile__( + + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 136f\n" + "cmp %x[M], #0x4\n" + "bgt 109f\n" + "beq 82f\n" + "cmp %x[M], #0x2\n" + "bgt 55f\n" + "beq 28f\n" + "ldr x17, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x16, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x15, %x[col_bias]\n" + "ldr x14, 
[%x[args_ptr], %[offsetof_N]]\n" + "mov x13, %x[output_ptr]\n" + "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "2:" // Height 1: Column loop + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "3:" // Height 1: setup done + "mov x11, #0x0\n" + "4:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w10, [x20, x11, LSL #0x2]\n" + "tbz %x[flags], #3, 5f\n" + "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x9, [x20, #0x0]\n" + "cbnz x11, 6f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x9, x9, x19\n" + "b 6f\n" + "5:" // Height 1: setup direct input + "mov x9, %x[input_ptr]\n" + "6:" // Height 1: input setup done + "cmp x10, #0x10\n" + "blt 9f\n" + "ldr q0, [x9, #0x0]\n" + "ldr q6, [x12, #0x0]\n" + "cmp x10, #0x20\n" + "blt 8f\n" + "7:" // Height 1: Multiply loop: Main loop head + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr d7, [x12, #0x10]\n" + "ldr x19, [x12, #0x18]\n" + "add x9, x9, #0x10\n" + "ldr d6, [x12, #0x20]\n" + "sub x10, x10, #0x10\n" + "ldr x28, [x12, #0x28]\n" + "cmp x10, #0x20\n" + "mov v7.d[1], x19\n" + "prfm pldl1keep, [x9, #0x80]\n" + "ldr x19, [x12, #0x38]\n" + "mov v6.d[1], x28\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr d7, [x12, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr d6, [x12, #0x40]\n" + "ldr x28, [x12, #0x48]\n" + "mov v7.d[1], x19\n" + "ldr x19, [x12, #0x58]\n" + "ldr x27, [x9, #0x8]\n" + "mov v6.d[1], x28\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "ldr d7, [x12, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + "ldr d6, [x12, #0x60]\n" + "ldr x28, [x12, #0x68]\n" + "mov v7.d[1], x19\n" + "ldr x19, [x12, #0x78]\n" + "mov v6.d[1], x28\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + "ldr d7, [x12, #0x70]\n" + ".inst 0x4fa0e0ca // 
sdot v10.4s, v6.16b, v0.4b[1]\n" + "ldr d6, [x12, #0x80]\n" + "ldr x28, [x12, #0x88]\n" + "mov v7.d[1], x19\n" + "ldr x19, [x12, #0x98]\n" + "mov v6.d[1], x28\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + "ldr d7, [x12, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + "ldr d6, [x12, #0xa0]\n" + "ldr x28, [x12, #0xa8]\n" + "mov v7.d[1], x19\n" + "ldr x19, [x12, #0xb8]\n" + "mov v6.d[1], x28\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + "ldr d7, [x12, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + "ldr d6, [x12, #0xc0]\n" + "ldr x28, [x12, #0xc8]\n" + "mov v7.d[1], x19\n" + "ldr x19, [x12, #0xd8]\n" + "mov v6.d[1], x28\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + "ldr d7, [x12, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + "ldr d6, [x12, #0xe0]\n" + "ldr x28, [x12, #0xe8]\n" + "mov v7.d[1], x19\n" + "ldr x19, [x12, #0xf8]\n" + "mov v6.d[1], x28\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + "ldr d7, [x12, #0xf0]\n" + "add x12, x12, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + "ldr d6, [x12, #0x0]\n" + "ldr x28, [x12, #0x8]\n" + "mov v7.d[1], x19\n" + "mov v6.d[1], x28\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + "ldr d0, [x9, #0x0]\n" + "mov v0.d[1], x27\n" + "bge 7b\n" + "8:" // Height 1: Multiply loop: Single iteration only + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x12, #0x10]\n" + "ldr q6, [x12, #0x20]\n" + "sub x10, x10, #0x10\n" + "add x9, x9, #0x10\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "prfm pldl1keep, [x9, #0x80]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x12, #0x30]\n" + "ldr q6, [x12, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x12, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x12, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x12, 
#0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x12, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x12, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x12, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x12, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x12, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x12, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + "ldr q6, [x12, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + "ldr q7, [x12, #0xf0]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + "add x12, x12, #0x100\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + "9:" // Height 1: Multiply loop: Main loop skip + "cbz x10, 14f\n" + "cmp x10, #0x4\n" + "blt 11f\n" + "10:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x9], #0x4\n" + "sub x10, x10, #0x4\n" + "ldr q6, [x12, #0x0]\n" + "cmp x10, #0x4\n" + "ldr q7, [x12, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q6, [x12, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "bge 10b\n" + "cbz x10, 14f\n" + "11:" // Height 1: Multiply loop: Skip odd blocks + "tbz x10, #1, 12f\n" + "ldr h0, [x9], #0x2\n" + "tbz x10, #0, 13f\n" + "ld1 { v0.b }[2], [x9]\n" + "b 13f\n" + "12:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x9, #0x0]\n" + "13:" // Height 1: Multiply loop: Ragged operand read: Done + "ldr q6, [x12, #0x0]\n" + "ldr q7, [x12, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q6, [x12, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + ".inst 
0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "14:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x11, x11, #0x1\n" + "cmp x11, x19\n" + "bne 4b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "ldr q0, [x15, #0x0]\n" + "ldr q1, [x15, #0x10]\n" + "ldr q2, [x15, #0x20]\n" + "add v8.4s, v8.4s, v0.4s\n" + "ldr q3, [x15, #0x30]\n" + "add v9.4s, v9.4s, v1.4s\n" + "add x15, x15, #0x40\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "tbz %x[flags], #4, 15f\n" + "ldr q0, [x16, #0x0]\n" + "ldr q4, [x17, #0x0]\n" + "ldr q1, [x16, #0x10]\n" + "ldr q5, [x17, #0x10]\n" + "ldr q2, [x16, #0x20]\n" + "ldr q6, [x17, #0x20]\n" + "ldr q3, [x16, #0x30]\n" + "add x16, x16, #0x40\n" + "ldr q7, [x17, #0x30]\n" + "add x17, x17, #0x40\n" + "b 16f\n" + "15:" // Height 1: per layer parameters + "add x25, %x[qp], %[per_layer_right_shift]\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1r { v0.4s }, [x25]\n" + "ld1r { v4.4s }, [x24]\n" + "mov v1.16b, v0.16b\n" + "mov v2.16b, v0.16b\n" + "mov v5.16b, v4.16b\n" + "mov v6.16b, v4.16b\n" + "mov v3.16b, v0.16b\n" + "mov v7.16b, v4.16b\n" + "16:" // Height 1: parameters loaded + "sqrdmulh v8.4s, v8.4s, v4.4s\n" + "sqrdmulh v9.4s, v9.4s, v5.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "sqrdmulh v11.4s, v11.4s, v7.4s\n" + "tbz %x[flags], #5, 17f\n" + "and v4.16b, v8.16b, v0.16b\n" + "and v5.16b, v9.16b, v1.16b\n" + "and v6.16b, v10.16b, v2.16b\n" + "and v7.16b, v11.16b, v3.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v4.4s\n" + "sqadd v9.4s, v9.4s, v5.4s\n" + "sqadd v10.4s, v10.4s, v6.4s\n" + "sqadd v11.4s, v11.4s, v7.4s\n" + "17:" // Height 1: no shift correction + "srshl v8.4s, v8.4s, v0.4s\n" + "srshl v9.4s, v9.4s, v1.4s\n" + "srshl v10.4s, v10.4s, v2.4s\n" + "srshl v11.4s, v11.4s, v3.4s\n" + "add x24, %x[qp], 
%[c_offset]\n" + "add x25, %x[qp], %[minval]\n" + "ld1r { v4.4s }, [x24]\n" + "add x24, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x25]\n" + "cmp x14, #0x10\n" + "ld1r { v6.4s }, [x24]\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "uzp1 v8.8h, v8.8h, v9.8h\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "bge 26f\n" + "tbz x14, #3, 21f\n" + "str d8, [x13], #0x8\n" + "tbz x14, #2, 19f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "tbz x14, #1, 18f\n" + "st1 { v8.h }[6], [x13], #0x2\n" + "tbz x14, #0, 25f\n" + "st1 { v8.b }[14], [x13]\n" + "b 25f\n" + "18:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x14, #0, 25f\n" + "st1 { v8.b }[12], [x13]\n" + "b 25f\n" + "19:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x14, #1, 20f\n" + "st1 { v8.h }[4], [x13], #0x2\n" + "tbz x14, #0, 25f\n" + "st1 { v8.b }[10], [x13]\n" + "b 25f\n" + "20:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x14, #0, 25f\n" + "st1 { v8.b }[8], [x13]\n" + "b 25f\n" + "21:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x14, #2, 23f\n" + "str s8, [x13], #0x4\n" + "tbz x14, #1, 22f\n" + "st1 { v8.h }[2], [x13], #0x2\n" + "tbz x14, #0, 25f\n" + "st1 { v8.b }[6], [x13]\n" + "b 25f\n" + "22:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x14, #0, 25f\n" + "st1 { v8.b }[4], [x13]\n" + "b 25f\n" + "23:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x14, #1, 24f\n" + "str h8, [x13], #0x2\n" + "tbz x14, #0, 25f\n" + "st1 { v8.b }[2], [x13]\n" + "b 25f\n" + "24:" // Height 1: Partial direct writeback: partial_1_0 + "str b8, [x13, #0x0]\n" + "25:" // Height 1: Partial direct writeback: Done 
+ "b 27f\n" + "26:" // Height 1: Full writeback + "str q8, [x13, #0x0]\n" + "add x13, x13, #0x10\n" + "27:" // Height 1: Writeback done + "subs x14, x14, #0x10\n" + "bgt 2b\n" + "b 164f\n" + "28:" // Height 2 + "ldr x17, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x15, %x[col_bias]\n" + "ldr x16, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x13, %x[output_ptr]\n" + "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "29:" // Height 2: Column loop + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "30:" // Height 2: setup done + "mov x11, #0x0\n" + "31:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w10, [x20, x11, LSL #0x2]\n" + "tbz %x[flags], #3, 32f\n" + "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x9, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "cbnz x11, 33f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x9, x9, x19\n" + "add x26, x26, x19\n" + "b 33f\n" + "32:" // Height 2: setup direct input + "mov x9, %x[input_ptr]\n" + "add x26, x9, x19\n" + "33:" // Height 2: input setup done + "cmp x10, #0x10\n" + "blt 36f\n" + "ldr q0, [x9, #0x0]\n" + "ldr q1, [x26, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q6, [x12, #0x0]\n" + "blt 35f\n" + "34:" // Height 2: Multiply loop: Main loop head + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "ldr d7, [x12, #0x10]\n" + "add x9, x9, #0x10\n" + "ldr x19, [x12, #0x18]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x9, #0x80]\n" + "sub x10, x10, #0x10\n" + "ldr d6, [x12, #0x20]\n" + "cmp x10, #0x20\n" + "mov v7.d[1], x19\n" + "ldr x28, [x12, #0x28]\n" + "ldr x19, [x12, #0x38]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, 
v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "mov v6.d[1], x28\n" + "ldr d7, [x12, #0x30]\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x12, #0x40]\n" + "ldr x28, [x12, #0x48]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "ldr d7, [x12, #0x50]\n" + "mov v6.d[1], x28\n" + "ldr x19, [x12, #0x58]\n" + "ldr x28, [x12, #0x68]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x12, #0x60]\n" + "ldr x19, [x12, #0x78]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + "mov v6.d[1], x28\n" + "ldr d7, [x12, #0x70]\n" + "ldr x28, [x12, #0x88]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x12, #0x80]\n" + "ldr x19, [x12, #0x98]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + "mov v6.d[1], x28\n" + "ldr d7, [x12, #0x90]\n" + "ldr x28, [x12, #0xa8]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x12, #0xa0]\n" + "ldr x19, [x12, #0xb8]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + "mov v6.d[1], x28\n" + "ldr d7, [x12, #0xb0]\n" + "ldr x28, [x12, #0xc8]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x12, #0xc0]\n" + "ldr x19, [x12, #0xd8]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, 
v1.4b[2]\n" + "mov v6.d[1], x28\n" + "ldr d7, [x12, #0xd0]\n" + "ldr x28, [x12, #0xe8]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x12, #0xe0]\n" + "ldr x19, [x12, #0xf8]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + "mov v6.d[1], x28\n" + "ldr d7, [x12, #0xf0]\n" + "ldr x27, [x9, #0x8]\n" + "add x12, x12, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + "mov v7.d[1], x19\n" + "ldr x25, [x26, #0x8]\n" + "ldr d6, [x12, #0x0]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + "ldr d0, [x9, #0x0]\n" + "ldr d1, [x26, #0x0]\n" + "ldr x28, [x12, #0x8]\n" + "mov v0.d[1], x27\n" + "mov v1.d[1], x25\n" + "mov v6.d[1], x28\n" + "bge 34b\n" + "35:" // Height 2: Multiply loop: Single iteration only + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q7, [x12, #0x10]\n" + "sub x10, x10, #0x10\n" + "ldr q6, [x12, #0x20]\n" + "add x9, x9, #0x10\n" + "prfm pldl1keep, [x9, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + "ldr q7, [x12, #0x30]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x12, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x12, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x12, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + "ldr q7, 
[x12, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x12, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x12, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x12, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x12, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x12, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x12, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + "ldr q6, [x12, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + "ldr q7, [x12, #0xf0]\n" + "add x12, x12, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + "36:" // Height 2: Multiply loop: Main loop skip + "cbz x10, 41f\n" + "cmp x10, #0x4\n" + "blt 38f\n" + "37:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x9], #0x4\n" + "sub x10, x10, #0x4\n" + "ldr s1, [x26], #0x4\n" + "cmp x10, #0x4\n" + "ldr q6, [x12, #0x0]\n" + "ldr q7, [x12, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "ldr q6, [x12, #0x20]\n" + "ldr q7, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + ".inst 
0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "bge 37b\n" + "cbz x10, 41f\n" + "38:" // Height 2: Multiply loop: Skip odd blocks + "tbz x10, #1, 39f\n" + "ldr h0, [x9], #0x2\n" + "ldr h1, [x26], #0x2\n" + "tbz x10, #0, 40f\n" + "ld1 { v0.b }[2], [x9]\n" + "ld1 { v1.b }[2], [x26]\n" + "b 40f\n" + "39:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x9, #0x0]\n" + "ldr b1, [x26, #0x0]\n" + "40:" // Height 2: Multiply loop: Ragged operand read: Done + "ldr q6, [x12, #0x0]\n" + "ldr q7, [x12, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "ldr q6, [x12, #0x20]\n" + "ldr q7, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "41:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x11, x11, #0x1\n" + "cmp x11, x19\n" + "bne 31b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "ldr q0, [x15, #0x0]\n" + "ldr q1, [x15, #0x10]\n" + "add x23, x13, x19\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add v8.4s, v8.4s, v0.4s\n" + "add v12.4s, v12.4s, v0.4s\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v13.4s, v13.4s, v1.4s\n" + "ldr q2, [x15, #0x20]\n" + "ldr q3, [x15, #0x30]\n" + "add x15, x15, #0x40\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v14.4s, v14.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v15.4s, v15.4s, v3.4s\n" + "tbz %x[flags], #4, 42f\n" + "ldr q0, [x16, #0x0]\n" + "ldr q4, [x17, 
#0x0]\n" + "ldr q1, [x16, #0x10]\n" + "ldr q5, [x17, #0x10]\n" + "ldr q2, [x16, #0x20]\n" + "ldr q6, [x17, #0x20]\n" + "ldr q3, [x16, #0x30]\n" + "add x16, x16, #0x40\n" + "ldr q7, [x17, #0x30]\n" + "add x17, x17, #0x40\n" + "b 43f\n" + "42:" // Height 2: per layer parameters + "add x25, %x[qp], %[per_layer_right_shift]\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1r { v0.4s }, [x25]\n" + "ld1r { v4.4s }, [x24]\n" + "mov v1.16b, v0.16b\n" + "mov v2.16b, v0.16b\n" + "mov v5.16b, v4.16b\n" + "mov v6.16b, v4.16b\n" + "mov v3.16b, v0.16b\n" + "mov v7.16b, v4.16b\n" + "43:" // Height 2: parameters loaded + "sqrdmulh v8.4s, v8.4s, v4.4s\n" + "sqrdmulh v9.4s, v9.4s, v5.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "sqrdmulh v11.4s, v11.4s, v7.4s\n" + "sqrdmulh v12.4s, v12.4s, v4.4s\n" + "sqrdmulh v13.4s, v13.4s, v5.4s\n" + "sqrdmulh v14.4s, v14.4s, v6.4s\n" + "sqrdmulh v15.4s, v15.4s, v7.4s\n" + "tbz %x[flags], #5, 44f\n" + "and v4.16b, v8.16b, v0.16b\n" + "and v5.16b, v9.16b, v1.16b\n" + "and v6.16b, v10.16b, v2.16b\n" + "and v7.16b, v11.16b, v3.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v4.4s\n" + "sqadd v9.4s, v9.4s, v5.4s\n" + "sqadd v10.4s, v10.4s, v6.4s\n" + "sqadd v11.4s, v11.4s, v7.4s\n" + "and v4.16b, v12.16b, v0.16b\n" + "and v5.16b, v13.16b, v1.16b\n" + "and v6.16b, v14.16b, v2.16b\n" + "and v7.16b, v15.16b, v3.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v12.4s, v12.4s, v4.4s\n" + "sqadd v13.4s, v13.4s, v5.4s\n" + "sqadd v14.4s, v14.4s, v6.4s\n" + "sqadd v15.4s, v15.4s, v7.4s\n" + "44:" // Height 2: no shift correction + "srshl v8.4s, v8.4s, v0.4s\n" + "srshl v9.4s, v9.4s, v1.4s\n" + "srshl v10.4s, v10.4s, v2.4s\n" + "srshl v11.4s, v11.4s, v3.4s\n" + "srshl v12.4s, v12.4s, v0.4s\n" + "srshl v13.4s, v13.4s, v1.4s\n" + "srshl v14.4s, v14.4s, v2.4s\n" + 
"srshl v15.4s, v15.4s, v3.4s\n" + "add x24, %x[qp], %[c_offset]\n" + "add x25, %x[qp], %[minval]\n" + "ld1r { v4.4s }, [x24]\n" + "add x24, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x25]\n" + "cmp x14, #0x10\n" + "ld1r { v6.4s }, [x24]\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "uzp1 v8.8h, v8.8h, v9.8h\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v12.8h, v12.8h, v13.8h\n" + "uzp1 v13.8h, v14.8h, v15.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v12.16b, v12.16b, v13.16b\n" + "bge 53f\n" + "tbz x14, #3, 48f\n" + "str d8, [x13], #0x8\n" + "str d12, [x23], #0x8\n" + "tbz x14, #2, 46f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "st1 { v12.s }[2], [x23], #0x4\n" + "tbz x14, #1, 45f\n" + "st1 { v8.h }[6], [x13], #0x2\n" + "st1 { v12.h }[6], [x23], #0x2\n" + "tbz x14, #0, 52f\n" + "st1 { v8.b }[14], [x13]\n" + "st1 { v12.b }[14], [x23]\n" + "b 52f\n" + "45:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x14, #0, 52f\n" + "st1 { v8.b }[12], [x13]\n" + "st1 { v12.b }[12], [x23]\n" + "b 52f\n" + "46:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x14, #1, 47f\n" + "st1 { v8.h }[4], [x13], #0x2\n" + "st1 { v12.h }[4], [x23], #0x2\n" + "tbz x14, #0, 52f\n" + "st1 { v8.b }[10], [x13]\n" + "st1 { v12.b }[10], [x23]\n" + "b 52f\n" + "47:" 
// Height 2: Partial direct writeback: partial_1_8 + "tbz x14, #0, 52f\n" + "st1 { v8.b }[8], [x13]\n" + "st1 { v12.b }[8], [x23]\n" + "b 52f\n" + "48:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x14, #2, 50f\n" + "str s8, [x13], #0x4\n" + "str s12, [x23], #0x4\n" + "tbz x14, #1, 49f\n" + "st1 { v8.h }[2], [x13], #0x2\n" + "st1 { v12.h }[2], [x23], #0x2\n" + "tbz x14, #0, 52f\n" + "st1 { v8.b }[6], [x13]\n" + "st1 { v12.b }[6], [x23]\n" + "b 52f\n" + "49:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x14, #0, 52f\n" + "st1 { v8.b }[4], [x13]\n" + "st1 { v12.b }[4], [x23]\n" + "b 52f\n" + "50:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x14, #1, 51f\n" + "str h8, [x13], #0x2\n" + "str h12, [x23], #0x2\n" + "tbz x14, #0, 52f\n" + "st1 { v8.b }[2], [x13]\n" + "st1 { v12.b }[2], [x23]\n" + "b 52f\n" + "51:" // Height 2: Partial direct writeback: partial_1_0 + "str b8, [x13, #0x0]\n" + "str b12, [x23, #0x0]\n" + "52:" // Height 2: Partial direct writeback: Done + "b 54f\n" + "53:" // Height 2: Full writeback + "str q8, [x13, #0x0]\n" + "add x13, x13, #0x10\n" + "str q12, [x23, #0x0]\n" + "54:" // Height 2: Writeback done + "subs x14, x14, #0x10\n" + "bgt 29b\n" + "b 164f\n" + "55:" // Height 3 + "ldr x17, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x15, %x[col_bias]\n" + "ldr x16, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x13, %x[output_ptr]\n" + "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "56:" // Height 3: Column loop + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "57:" // Height 3: setup done + "mov x11, #0x0\n" + "58:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, 
[%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w10, [x20, x11, LSL #0x2]\n" + "tbz %x[flags], #3, 59f\n" + "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x9, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "cbnz x11, 60f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x9, x9, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "b 60f\n" + "59:" // Height 3: setup direct input + "mov x9, %x[input_ptr]\n" + "add x26, x9, x19\n" + "add x24, x26, x19\n" + "60:" // Height 3: input setup done + "cmp x10, #0x10\n" + "blt 63f\n" + "ldr q0, [x9, #0x0]\n" + "ldr q1, [x26, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q2, [x24, #0x0]\n" + "ldr q6, [x12, #0x0]\n" + "blt 62f\n" + "61:" // Height 3: Multiply loop: Main loop head + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "ldr d7, [x12, #0x10]\n" + "ldr x19, [x12, #0x18]\n" + "add x9, x9, #0x10\n" + "ldr d6, [x12, #0x20]\n" + "add x26, x26, #0x10\n" + "ldr x28, [x12, #0x28]\n" + "add x24, x24, #0x10\n" + "mov v7.d[1], x19\n" + "prfm pldl1keep, [x9, #0x80]\n" + "ldr x19, [x12, #0x38]\n" + "sub x10, x10, #0x10\n" + "mov v6.d[1], x28\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + "ldr d7, [x12, #0x30]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "cmp x10, #0x20\n" + "ldr d6, [x12, #0x40]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x12, #0x48]\n" + "ldr x19, [x12, #0x58]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "mov v6.d[1], x28\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, 
v2.4b[0]\n" + "ldr d7, [x12, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x12, #0x60]\n" + "ldr x28, [x12, #0x68]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + "ldr d7, [x12, #0x70]\n" + "mov v6.d[1], x28\n" + "ldr x19, [x12, #0x78]\n" + "ldr x28, [x12, #0x88]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + "mov v7.d[1], x19\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + "ldr d6, [x12, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + "ldr d7, [x12, #0x90]\n" + "mov v6.d[1], x28\n" + "ldr x19, [x12, #0x98]\n" + "ldr x28, [x12, #0xa8]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + "mov v7.d[1], x19\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + "ldr d6, [x12, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + "ldr d7, [x12, #0xb0]\n" + "mov v6.d[1], x28\n" + "ldr x19, [x12, #0xb8]\n" + "ldr x28, [x12, #0xc8]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + "mov v7.d[1], x19\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + "ldr d6, [x12, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + "ldr d7, [x12, #0xd0]\n" + "mov 
v6.d[1], x28\n" + "ldr x19, [x12, #0xd8]\n" + "ldr x28, [x12, #0xe8]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + "mov v7.d[1], x19\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + "ldr d6, [x12, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + "ldr d7, [x12, #0xf0]\n" + "mov v6.d[1], x28\n" + "ldr x19, [x12, #0xf8]\n" + "ldr x27, [x9, #0x8]\n" + "add x12, x12, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + "mov v7.d[1], x19\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + "ldr x25, [x26, #0x8]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + "ldr d0, [x9, #0x0]\n" + "ldr d1, [x26, #0x0]\n" + "ldr d2, [x24, #0x0]\n" + "mov v0.d[1], x27\n" + "ldr x21, [x24, #0x8]\n" + "mov v1.d[1], x25\n" + "ldr d6, [x12, #0x0]\n" + "ldr x28, [x12, #0x8]\n" + "mov v2.d[1], x21\n" + "mov v6.d[1], x28\n" + "bge 61b\n" + "62:" // Height 3: Multiply loop: Single iteration only + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q7, [x12, #0x10]\n" + "ldr q6, [x12, #0x20]\n" + "sub x10, x10, #0x10\n" + "add x9, x9, #0x10\n" + "add x26, x26, #0x10\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "add x24, x24, #0x10\n" + "ldr q7, 
[x12, #0x30]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x12, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x12, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + "ldr q6, [x12, #0x60]\n" + "ldr q7, [x12, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + "ldr q6, [x12, #0x80]\n" + "ldr q7, [x12, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + "ldr q6, [x12, #0xa0]\n" + "ldr q7, [x12, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + "ldr q6, [x12, #0xc0]\n" + "ldr q7, [x12, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // 
sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + "ldr q6, [x12, #0xe0]\n" + "ldr q7, [x12, #0xf0]\n" + "add x12, x12, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + "63:" // Height 3: Multiply loop: Main loop skip + "cbz x10, 68f\n" + "cmp x10, #0x4\n" + "blt 65f\n" + "64:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x9], #0x4\n" + "sub x10, x10, #0x4\n" + "ldr s1, [x26], #0x4\n" + "cmp x10, #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr q6, [x12, #0x0]\n" + "ldr q7, [x12, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "ldr q6, [x12, #0x20]\n" + "ldr q7, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "bge 64b\n" + "cbz x10, 68f\n" + "65:" // Height 3: Multiply loop: Skip odd blocks + "tbz x10, #1, 66f\n" + "ldr h0, [x9], #0x2\n" + "ldr h1, [x26], #0x2\n" + "ldr h2, [x24], #0x2\n" + "tbz x10, #0, 67f\n" + "ld1 { v0.b }[2], [x9]\n" + "ld1 { v1.b }[2], [x26]\n" + "ld1 { v2.b }[2], [x24]\n" + "b 
67f\n" + "66:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x9, #0x0]\n" + "ldr b1, [x26, #0x0]\n" + "ldr b2, [x24, #0x0]\n" + "67:" // Height 3: Multiply loop: Ragged operand read: Done + "ldr q6, [x12, #0x0]\n" + "ldr q7, [x12, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "ldr q6, [x12, #0x20]\n" + "ldr q7, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "68:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x11, x11, #0x1\n" + "cmp x11, x19\n" + "bne 58b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "ldr q0, [x15, #0x0]\n" + "ldr q1, [x15, #0x10]\n" + "add x23, x13, x19\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19\n" + "add v8.4s, v8.4s, v0.4s\n" + "add v12.4s, v12.4s, v0.4s\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v13.4s, v13.4s, v1.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "prfm pstl1keep, [x22, #0x0]\n" + "ldr q2, [x15, #0x20]\n" + "ldr q3, [x15, #0x30]\n" + "add x15, x15, #0x40\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v14.4s, v14.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v15.4s, v15.4s, v3.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "tbz %x[flags], #4, 69f\n" + "ldr q0, [x16, #0x0]\n" + "ldr q4, [x17, #0x0]\n" + "ldr q1, 
[x16, #0x10]\n" + "ldr q5, [x17, #0x10]\n" + "ldr q2, [x16, #0x20]\n" + "ldr q6, [x17, #0x20]\n" + "ldr q3, [x16, #0x30]\n" + "add x16, x16, #0x40\n" + "ldr q7, [x17, #0x30]\n" + "add x17, x17, #0x40\n" + "b 70f\n" + "69:" // Height 3: per layer parameters + "add x25, %x[qp], %[per_layer_right_shift]\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1r { v0.4s }, [x25]\n" + "ld1r { v4.4s }, [x24]\n" + "mov v1.16b, v0.16b\n" + "mov v2.16b, v0.16b\n" + "mov v5.16b, v4.16b\n" + "mov v6.16b, v4.16b\n" + "mov v3.16b, v0.16b\n" + "mov v7.16b, v4.16b\n" + "70:" // Height 3: parameters loaded + "sqrdmulh v8.4s, v8.4s, v4.4s\n" + "sqrdmulh v9.4s, v9.4s, v5.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "sqrdmulh v11.4s, v11.4s, v7.4s\n" + "sqrdmulh v12.4s, v12.4s, v4.4s\n" + "sqrdmulh v13.4s, v13.4s, v5.4s\n" + "sqrdmulh v14.4s, v14.4s, v6.4s\n" + "sqrdmulh v15.4s, v15.4s, v7.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v5.4s\n" + "sqrdmulh v18.4s, v18.4s, v6.4s\n" + "sqrdmulh v19.4s, v19.4s, v7.4s\n" + "tbz %x[flags], #5, 71f\n" + "and v4.16b, v8.16b, v0.16b\n" + "and v5.16b, v9.16b, v1.16b\n" + "and v6.16b, v10.16b, v2.16b\n" + "and v7.16b, v11.16b, v3.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v4.4s\n" + "and v4.16b, v12.16b, v0.16b\n" + "sqadd v9.4s, v9.4s, v5.4s\n" + "sqadd v10.4s, v10.4s, v6.4s\n" + "sqadd v11.4s, v11.4s, v7.4s\n" + "and v5.16b, v13.16b, v1.16b\n" + "and v6.16b, v14.16b, v2.16b\n" + "and v7.16b, v15.16b, v3.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v12.4s, v12.4s, v4.4s\n" + "and v4.16b, v16.16b, v0.16b\n" + "sqadd v13.4s, v13.4s, v5.4s\n" + "and v5.16b, v17.16b, v1.16b\n" + "sqadd v14.4s, v14.4s, v6.4s\n" + "sqadd v15.4s, v15.4s, v7.4s\n" + "and v6.16b, v18.16b, v2.16b\n" + "and v7.16b, v19.16b, v3.16b\n" + "sshr v4.4s, 
v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "71:" // Height 3: no shift correction + "srshl v8.4s, v8.4s, v0.4s\n" + "srshl v9.4s, v9.4s, v1.4s\n" + "srshl v10.4s, v10.4s, v2.4s\n" + "srshl v11.4s, v11.4s, v3.4s\n" + "srshl v12.4s, v12.4s, v0.4s\n" + "srshl v13.4s, v13.4s, v1.4s\n" + "srshl v14.4s, v14.4s, v2.4s\n" + "srshl v15.4s, v15.4s, v3.4s\n" + "srshl v16.4s, v16.4s, v0.4s\n" + "srshl v17.4s, v17.4s, v1.4s\n" + "srshl v18.4s, v18.4s, v2.4s\n" + "srshl v19.4s, v19.4s, v3.4s\n" + "add x24, %x[qp], %[c_offset]\n" + "add x25, %x[qp], %[minval]\n" + "ld1r { v4.4s }, [x24]\n" + "add x24, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x25]\n" + "cmp x14, #0x10\n" + "ld1r { v6.4s }, [x24]\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "uzp1 v8.8h, v8.8h, v9.8h\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 
v12.8h, v12.8h, v13.8h\n" + "uzp1 v13.8h, v14.8h, v15.8h\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v12.16b, v12.16b, v13.16b\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "bge 80f\n" + "tbz x14, #3, 75f\n" + "str d8, [x13], #0x8\n" + "str d12, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "tbz x14, #2, 73f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "st1 { v12.s }[2], [x23], #0x4\n" + "st1 { v16.s }[2], [x22], #0x4\n" + "tbz x14, #1, 72f\n" + "st1 { v8.h }[6], [x13], #0x2\n" + "st1 { v12.h }[6], [x23], #0x2\n" + "st1 { v16.h }[6], [x22], #0x2\n" + "tbz x14, #0, 79f\n" + "st1 { v8.b }[14], [x13]\n" + "st1 { v12.b }[14], [x23]\n" + "st1 { v16.b }[14], [x22]\n" + "b 79f\n" + "72:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x14, #0, 79f\n" + "st1 { v8.b }[12], [x13]\n" + "st1 { v12.b }[12], [x23]\n" + "st1 { v16.b }[12], [x22]\n" + "b 79f\n" + "73:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x14, #1, 74f\n" + "st1 { v8.h }[4], [x13], #0x2\n" + "st1 { v12.h }[4], [x23], #0x2\n" + "st1 { v16.h }[4], [x22], #0x2\n" + "tbz x14, #0, 79f\n" + "st1 { v8.b }[10], [x13]\n" + "st1 { v12.b }[10], [x23]\n" + "st1 { v16.b }[10], [x22]\n" + "b 79f\n" + "74:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x14, #0, 79f\n" + "st1 { v8.b }[8], [x13]\n" + "st1 { v12.b }[8], [x23]\n" + "st1 { v16.b }[8], [x22]\n" + "b 79f\n" + "75:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x14, #2, 77f\n" + "str s8, [x13], #0x4\n" + "str s12, [x23], #0x4\n" + "str s16, [x22], #0x4\n" + "tbz x14, #1, 76f\n" + "st1 { v8.h }[2], [x13], #0x2\n" + "st1 { v12.h }[2], [x23], #0x2\n" + "st1 { v16.h }[2], [x22], #0x2\n" + "tbz x14, #0, 79f\n" + "st1 { v8.b }[6], [x13]\n" + "st1 { v12.b }[6], [x23]\n" + "st1 { v16.b }[6], [x22]\n" + "b 79f\n" + "76:" // Height 3: 
Partial direct writeback: partial_1_4 + "tbz x14, #0, 79f\n" + "st1 { v8.b }[4], [x13]\n" + "st1 { v12.b }[4], [x23]\n" + "st1 { v16.b }[4], [x22]\n" + "b 79f\n" + "77:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x14, #1, 78f\n" + "str h8, [x13], #0x2\n" + "str h12, [x23], #0x2\n" + "str h16, [x22], #0x2\n" + "tbz x14, #0, 79f\n" + "st1 { v8.b }[2], [x13]\n" + "st1 { v12.b }[2], [x23]\n" + "st1 { v16.b }[2], [x22]\n" + "b 79f\n" + "78:" // Height 3: Partial direct writeback: partial_1_0 + "str b8, [x13, #0x0]\n" + "str b12, [x23, #0x0]\n" + "str b16, [x22, #0x0]\n" + "79:" // Height 3: Partial direct writeback: Done + "b 81f\n" + "80:" // Height 3: Full writeback + "str q8, [x13, #0x0]\n" + "add x13, x13, #0x10\n" + "str q12, [x23, #0x0]\n" + "str q16, [x22, #0x0]\n" + "81:" // Height 3: Writeback done + "subs x14, x14, #0x10\n" + "bgt 56b\n" + "b 164f\n" + "82:" // Height 4 + "ldr x17, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x15, %x[col_bias]\n" + "ldr x16, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x13, %x[output_ptr]\n" + "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "83:" // Height 4: Column loop + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "84:" // Height 4: setup done + "mov x11, #0x0\n" + "85:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w10, [x20, x11, LSL #0x2]\n" + "tbz %x[flags], #3, 86f\n" + "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x9, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x24, [x20, 
#0x10]\n" + "ldr x23, [x20, #0x18]\n" + "cbnz x11, 87f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x9, x9, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "b 87f\n" + "86:" // Height 4: setup direct input + "mov x9, %x[input_ptr]\n" + "add x26, x9, x19\n" + "add x24, x26, x19\n" + "add x23, x24, x19\n" + "87:" // Height 4: input setup done + "cmp x10, #0x10\n" + "blt 90f\n" + "ldr q0, [x9, #0x0]\n" + "ldr q1, [x26, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q2, [x24, #0x0]\n" + "ldr q3, [x23, #0x0]\n" + "ldr q6, [x12, #0x0]\n" + "blt 89f\n" + "88:" // Height 4: Multiply loop: Main loop head + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "ldr d7, [x12, #0x10]\n" + "add x9, x9, #0x10\n" + "ldr x19, [x12, #0x18]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x9, #0x80]\n" + "add x24, x24, #0x10\n" + "ldr d6, [x12, #0x20]\n" + "add x23, x23, #0x10\n" + "mov v7.d[1], x19\n" + "ldr x28, [x12, #0x28]\n" + "ldr x19, [x12, #0x38]\n" + "sub x10, x10, #0x10\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "mov v6.d[1], x28\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "ldr d7, [x12, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + "mov v7.d[1], x19\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr d6, [x12, #0x40]\n" + "cmp x10, #0x20\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot 
v23.4s, v7.16b, v3.4b[0]\n" + "ldr x28, [x12, #0x48]\n" + "ldr d7, [x12, #0x50]\n" + "ldr x19, [x12, #0x58]\n" + "mov v6.d[1], x28\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr x28, [x12, #0x68]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x12, #0x60]\n" + "ldr x19, [x12, #0x78]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + "mov v6.d[1], x28\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + "ldr d7, [x12, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + "mov v7.d[1], x19\n" + "prfm pldl1keep, [x23, #0x80]\n" + "ldr d6, [x12, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + "ldr x28, [x12, #0x88]\n" + "ldr d7, [x12, #0x90]\n" + "ldr x19, [x12, #0x98]\n" + "mov v6.d[1], x28\n" + "ldr x28, [x12, #0xa8]\n" + "ldr x27, [x9, #0x8]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x12, #0xa0]\n" + "ldr x19, [x12, #0xb8]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + "mov v6.d[1], x28\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, 
v3.4b[2]\n" + "ldr d7, [x12, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x12, #0xc0]\n" + "ldr x28, [x12, #0xc8]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + "mov v6.d[1], x28\n" + "ldr d7, [x12, #0xd0]\n" + "ldr x19, [x12, #0xd8]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x12, #0xe0]\n" + "ldr x28, [x12, #0xe8]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + "mov v6.d[1], x28\n" + "ldr d7, [x12, #0xf0]\n" + "ldr x19, [x12, #0xf8]\n" + "add x12, x12, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x12, #0x0]\n" + "ldr x28, [x12, #0x8]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + "mov v6.d[1], x28\n" + "ldr d0, [x9, #0x0]\n" + "ldr d1, [x26, #0x0]\n" + "ldr x25, [x26, #0x8]\n" + "mov v0.d[1], x27\n" + "ldr d2, [x24, #0x0]\n" + "ldr x21, [x24, #0x8]\n" + "mov v1.d[1], x25\n" + 
"ldr d3, [x23, #0x0]\n" + "ldr x19, [x23, #0x8]\n" + "mov v2.d[1], x21\n" + "mov v3.d[1], x19\n" + "bge 88b\n" + "89:" // Height 4: Multiply loop: Single iteration only + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "ldr q7, [x12, #0x10]\n" + "sub x10, x10, #0x10\n" + "ldr q6, [x12, #0x20]\n" + "add x9, x9, #0x10\n" + "prfm pldl1keep, [x9, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + "ldr q7, [x12, #0x30]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x23, x23, #0x10\n" + "ldr q6, [x12, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + "ldr q7, [x12, #0x50]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x12, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, 
v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + "ldr q7, [x12, #0x70]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "ldr q6, [x12, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + "ldr q7, [x12, #0x90]\n" + "ldr q6, [x12, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + "ldr q7, [x12, #0xb0]\n" + "ldr q6, [x12, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + "ldr q7, [x12, #0xd0]\n" + "ldr q6, [x12, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, 
v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + "ldr q7, [x12, #0xf0]\n" + "add x12, x12, #0x100\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + "90:" // Height 4: Multiply loop: Main loop skip + "cbz x10, 95f\n" + "cmp x10, #0x4\n" + "blt 92f\n" + "91:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x9], #0x4\n" + "sub x10, x10, #0x4\n" + "ldr s1, [x26], #0x4\n" + "cmp x10, #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr s3, [x23], #0x4\n" + "ldr q6, [x12, #0x0]\n" + "ldr q7, [x12, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q6, [x12, #0x20]\n" + "ldr q7, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "bge 91b\n" + "cbz x10, 95f\n" + "92:" // Height 4: Multiply loop: Skip odd blocks + "tbz x10, #1, 93f\n" + "ldr h0, [x9], #0x2\n" + "ldr h1, [x26], #0x2\n" + "ldr h2, [x24], #0x2\n" + "ldr h3, [x23], #0x2\n" 
+ "tbz x10, #0, 94f\n" + "ld1 { v0.b }[2], [x9]\n" + "ld1 { v1.b }[2], [x26]\n" + "ld1 { v2.b }[2], [x24]\n" + "ld1 { v3.b }[2], [x23]\n" + "b 94f\n" + "93:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x9, #0x0]\n" + "ldr b1, [x26, #0x0]\n" + "ldr b2, [x24, #0x0]\n" + "ldr b3, [x23, #0x0]\n" + "94:" // Height 4: Multiply loop: Ragged operand read: Done + "ldr q6, [x12, #0x0]\n" + "ldr q7, [x12, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q6, [x12, #0x20]\n" + "ldr q7, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "95:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x11, x11, #0x1\n" + "cmp x11, x19\n" + "bne 85b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "ldr q0, [x15, #0x0]\n" + "ldr q1, [x15, #0x10]\n" + "add x23, x13, x19\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19\n" + "add v8.4s, v8.4s, v0.4s\n" + "add v12.4s, v12.4s, v0.4s\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v13.4s, v13.4s, v1.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add 
v20.4s, v20.4s, v0.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19\n" + "prfm pstl1keep, [x21, #0x0]\n" + "ldr q2, [x15, #0x20]\n" + "ldr q3, [x15, #0x30]\n" + "add x15, x15, #0x40\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v14.4s, v14.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v15.4s, v15.4s, v3.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "tbz %x[flags], #4, 96f\n" + "ldr q0, [x16, #0x0]\n" + "ldr q4, [x17, #0x0]\n" + "ldr q1, [x16, #0x10]\n" + "ldr q5, [x17, #0x10]\n" + "ldr q2, [x16, #0x20]\n" + "ldr q6, [x17, #0x20]\n" + "ldr q3, [x16, #0x30]\n" + "add x16, x16, #0x40\n" + "ldr q7, [x17, #0x30]\n" + "add x17, x17, #0x40\n" + "b 97f\n" + "96:" // Height 4: per layer parameters + "add x25, %x[qp], %[per_layer_right_shift]\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1r { v0.4s }, [x25]\n" + "ld1r { v4.4s }, [x24]\n" + "mov v1.16b, v0.16b\n" + "mov v2.16b, v0.16b\n" + "mov v5.16b, v4.16b\n" + "mov v6.16b, v4.16b\n" + "mov v3.16b, v0.16b\n" + "mov v7.16b, v4.16b\n" + "97:" // Height 4: parameters loaded + "sqrdmulh v8.4s, v8.4s, v4.4s\n" + "sqrdmulh v9.4s, v9.4s, v5.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "sqrdmulh v11.4s, v11.4s, v7.4s\n" + "sqrdmulh v12.4s, v12.4s, v4.4s\n" + "sqrdmulh v13.4s, v13.4s, v5.4s\n" + "sqrdmulh v14.4s, v14.4s, v6.4s\n" + "sqrdmulh v15.4s, v15.4s, v7.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v5.4s\n" + "sqrdmulh v18.4s, v18.4s, v6.4s\n" + "sqrdmulh v19.4s, v19.4s, v7.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v5.4s\n" + "sqrdmulh v22.4s, v22.4s, v6.4s\n" + "sqrdmulh v23.4s, v23.4s, v7.4s\n" + "tbz %x[flags], #5, 98f\n" + "and v4.16b, v8.16b, v0.16b\n" + "and v5.16b, v9.16b, v1.16b\n" + "and v6.16b, v10.16b, v2.16b\n" + "and v7.16b, v11.16b, v3.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, 
v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v4.4s\n" + "and v4.16b, v12.16b, v0.16b\n" + "sqadd v9.4s, v9.4s, v5.4s\n" + "sqadd v10.4s, v10.4s, v6.4s\n" + "sqadd v11.4s, v11.4s, v7.4s\n" + "and v5.16b, v13.16b, v1.16b\n" + "and v6.16b, v14.16b, v2.16b\n" + "and v7.16b, v15.16b, v3.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v12.4s, v12.4s, v4.4s\n" + "and v4.16b, v16.16b, v0.16b\n" + "sqadd v13.4s, v13.4s, v5.4s\n" + "and v5.16b, v17.16b, v1.16b\n" + "sqadd v14.4s, v14.4s, v6.4s\n" + "sqadd v15.4s, v15.4s, v7.4s\n" + "and v6.16b, v18.16b, v2.16b\n" + "and v7.16b, v19.16b, v3.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "and v4.16b, v20.16b, v0.16b\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "and v5.16b, v21.16b, v1.16b\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v6.16b, v22.16b, v2.16b\n" + "and v7.16b, v23.16b, v3.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v20.4s, v20.4s, v4.4s\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v21.4s, v21.4s, v5.4s\n" + "sqadd v22.4s, v22.4s, v6.4s\n" + "sqadd v23.4s, v23.4s, v7.4s\n" + "98:" // Height 4: no shift correction + "srshl v8.4s, v8.4s, v0.4s\n" + "srshl v9.4s, v9.4s, v1.4s\n" + "srshl v10.4s, v10.4s, v2.4s\n" + "srshl v11.4s, v11.4s, v3.4s\n" + "srshl v12.4s, v12.4s, v0.4s\n" + "srshl v13.4s, v13.4s, v1.4s\n" + "srshl v14.4s, v14.4s, v2.4s\n" + "srshl v15.4s, v15.4s, v3.4s\n" + "srshl v16.4s, v16.4s, v0.4s\n" + "srshl v17.4s, v17.4s, v1.4s\n" + "srshl v18.4s, v18.4s, v2.4s\n" + "srshl v19.4s, v19.4s, v3.4s\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "srshl v21.4s, v21.4s, v1.4s\n" + "srshl v22.4s, v22.4s, v2.4s\n" + "srshl v23.4s, v23.4s, v3.4s\n" + "add x24, %x[qp], %[c_offset]\n" + "add x25, 
%x[qp], %[minval]\n" + "ld1r { v4.4s }, [x24]\n" + "add x24, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x25]\n" + "cmp x14, #0x10\n" + "ld1r { v6.4s }, [x24]\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "uzp1 v8.8h, v8.8h, v9.8h\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v12.8h, v12.8h, v13.8h\n" + "uzp1 v13.8h, v14.8h, v15.8h\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v12.16b, v12.16b, v13.16b\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 
v20.8h, v20.8h, v21.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "bge 107f\n" + "tbz x14, #3, 102f\n" + "str d8, [x13], #0x8\n" + "str d12, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "str d20, [x21], #0x8\n" + "tbz x14, #2, 100f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "st1 { v12.s }[2], [x23], #0x4\n" + "st1 { v16.s }[2], [x22], #0x4\n" + "st1 { v20.s }[2], [x21], #0x4\n" + "tbz x14, #1, 99f\n" + "st1 { v8.h }[6], [x13], #0x2\n" + "st1 { v12.h }[6], [x23], #0x2\n" + "st1 { v16.h }[6], [x22], #0x2\n" + "st1 { v20.h }[6], [x21], #0x2\n" + "tbz x14, #0, 106f\n" + "st1 { v8.b }[14], [x13]\n" + "st1 { v12.b }[14], [x23]\n" + "st1 { v16.b }[14], [x22]\n" + "st1 { v20.b }[14], [x21]\n" + "b 106f\n" + "99:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x14, #0, 106f\n" + "st1 { v8.b }[12], [x13]\n" + "st1 { v12.b }[12], [x23]\n" + "st1 { v16.b }[12], [x22]\n" + "st1 { v20.b }[12], [x21]\n" + "b 106f\n" + "100:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x14, #1, 101f\n" + "st1 { v8.h }[4], [x13], #0x2\n" + "st1 { v12.h }[4], [x23], #0x2\n" + "st1 { v16.h }[4], [x22], #0x2\n" + "st1 { v20.h }[4], [x21], #0x2\n" + "tbz x14, #0, 106f\n" + "st1 { v8.b }[10], [x13]\n" + "st1 { v12.b }[10], [x23]\n" + "st1 { v16.b }[10], [x22]\n" + "st1 { v20.b }[10], [x21]\n" + "b 106f\n" + "101:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x14, #0, 106f\n" + "st1 { v8.b }[8], [x13]\n" + "st1 { v12.b }[8], [x23]\n" + "st1 { v16.b }[8], [x22]\n" + "st1 { v20.b }[8], [x21]\n" + "b 106f\n" + "102:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x14, #2, 104f\n" + "str s8, [x13], #0x4\n" + "str s12, [x23], #0x4\n" + "str s16, [x22], #0x4\n" + "str s20, [x21], #0x4\n" + "tbz x14, #1, 103f\n" + "st1 { v8.h }[2], [x13], #0x2\n" + "st1 { v12.h }[2], [x23], #0x2\n" + "st1 { v16.h }[2], [x22], #0x2\n" + "st1 { v20.h }[2], [x21], #0x2\n" + "tbz x14, #0, 106f\n" + "st1 { v8.b }[6], 
[x13]\n" + "st1 { v12.b }[6], [x23]\n" + "st1 { v16.b }[6], [x22]\n" + "st1 { v20.b }[6], [x21]\n" + "b 106f\n" + "103:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x14, #0, 106f\n" + "st1 { v8.b }[4], [x13]\n" + "st1 { v12.b }[4], [x23]\n" + "st1 { v16.b }[4], [x22]\n" + "st1 { v20.b }[4], [x21]\n" + "b 106f\n" + "104:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x14, #1, 105f\n" + "str h8, [x13], #0x2\n" + "str h12, [x23], #0x2\n" + "str h16, [x22], #0x2\n" + "str h20, [x21], #0x2\n" + "tbz x14, #0, 106f\n" + "st1 { v8.b }[2], [x13]\n" + "st1 { v12.b }[2], [x23]\n" + "st1 { v16.b }[2], [x22]\n" + "st1 { v20.b }[2], [x21]\n" + "b 106f\n" + "105:" // Height 4: Partial direct writeback: partial_1_0 + "str b8, [x13, #0x0]\n" + "str b12, [x23, #0x0]\n" + "str b16, [x22, #0x0]\n" + "str b20, [x21, #0x0]\n" + "106:" // Height 4: Partial direct writeback: Done + "b 108f\n" + "107:" // Height 4: Full writeback + "str q8, [x13, #0x0]\n" + "add x13, x13, #0x10\n" + "str q12, [x23, #0x0]\n" + "str q16, [x22, #0x0]\n" + "str q20, [x21, #0x0]\n" + "108:" // Height 4: Writeback done + "subs x14, x14, #0x10\n" + "bgt 83b\n" + "b 164f\n" + "109:" // Height 5 + "ldr x17, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x15, %x[col_bias]\n" + "ldr x16, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x13, %x[output_ptr]\n" + "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "110:" // Height 5: Column loop + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "111:" // Height 5: setup done + 
"mov x11, #0x0\n" + "112:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w10, [x20, x11, LSL #0x2]\n" + "tbz %x[flags], #3, 113f\n" + "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x9, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x11, 114f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x9, x9, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "b 114f\n" + "113:" // Height 5: setup direct input + "mov x9, %x[input_ptr]\n" + "add x26, x9, x19\n" + "add x24, x26, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "114:" // Height 5: input setup done + "cmp x10, #0x10\n" + "blt 117f\n" + "ldr q0, [x9, #0x0]\n" + "ldr q1, [x26, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q2, [x24, #0x0]\n" + "ldr q3, [x23, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x12, #0x0]\n" + "blt 116f\n" + "115:" // Height 5: Multiply loop: Main loop head + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "ldr d7, [x12, #0x10]\n" + "ldr x19, [x12, #0x18]\n" + "add x9, x9, #0x10\n" + "ldr d6, [x12, #0x20]\n" + "add x26, x26, #0x10\n" + "ldr x28, [x12, #0x28]\n" + "add x24, x24, #0x10\n" + "mov v7.d[1], x19\n" + "prfm pldl1keep, [x9, #0x80]\n" + "ldr x19, [x12, #0x38]\n" + "add x23, x23, #0x10\n" + "mov v6.d[1], x28\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, 
v7.16b, v4.4b[0]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + "ldr d7, [x12, #0x30]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x22, x22, #0x10\n" + "ldr d6, [x12, #0x40]\n" + "sub x10, x10, #0x10\n" + "mov v7.d[1], x19\n" + "ldr x28, [x12, #0x48]\n" + "ldr x19, [x12, #0x58]\n" + "cmp x10, #0x20\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "mov v6.d[1], x28\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" + "ldr d7, [x12, #0x50]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr d6, [x12, #0x60]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x12, #0x68]\n" + "ldr x19, [x12, #0x78]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + "mov v6.d[1], x28\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" + "ldr d7, [x12, #0x70]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "ldr d6, [x12, #0x80]\n" + "mov v7.d[1], x19\n" + "ldr x28, 
[x12, #0x88]\n" + "ldr x19, [x12, #0x98]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + "mov v6.d[1], x28\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" + "ldr d7, [x12, #0x90]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr d6, [x12, #0xa0]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x12, #0xa8]\n" + "ldr x19, [x12, #0xb8]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + "mov v6.d[1], x28\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" + "ldr d7, [x12, #0xb0]\n" + "ldr d6, [x12, #0xc0]\n" + "ldr x28, [x12, #0xc8]\n" + "mov v7.d[1], x19\n" + "ldr x19, [x12, #0xd8]\n" + "ldr x27, [x9, #0x8]\n" + "mov v6.d[1], x28\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, 
v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" + "ldr d7, [x12, #0xd0]\n" + "ldr d6, [x12, #0xe0]\n" + "ldr x28, [x12, #0xe8]\n" + "mov v7.d[1], x19\n" + "ldr x19, [x12, #0xf8]\n" + "ldr x25, [x26, #0x8]\n" + "mov v6.d[1], x28\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + "ldr d7, [x12, #0xf0]\n" + "ldr x21, [x24, #0x8]\n" + "add x12, x12, #0x100\n" + "ldr d6, [x12, #0x0]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x12, #0x8]\n" + "ldr x19, [x23, #0x8]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + "mov v6.d[1], x28\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + "ldr d0, [x9, #0x0]\n" + "ldr d1, [x26, #0x0]\n" + "ldr d2, [x24, #0x0]\n" + "mov v0.d[1], x27\n" + "ldr d3, [x23, #0x0]\n" + "mov v1.d[1], x25\n" + "ldr d4, [x22, #0x0]\n" + "mov v2.d[1], x21\n" + "ldr x21, [x22, #0x8]\n" + "mov v3.d[1], x19\n" + "mov v4.d[1], x21\n" + "bge 115b\n" + "116:" // Height 5: Multiply loop: Single iteration only + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "ldr q7, [x12, 
#0x10]\n" + "ldr q6, [x12, #0x20]\n" + "sub x10, x10, #0x10\n" + "add x9, x9, #0x10\n" + "add x26, x26, #0x10\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "add x24, x24, #0x10\n" + "ldr q7, [x12, #0x30]\n" + "add x23, x23, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x22, x22, #0x10\n" + "ldr q6, [x12, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" + "ldr q7, [x12, #0x50]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x12, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot 
v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" + "ldr q7, [x12, #0x70]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "ldr q6, [x12, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" + "ldr q7, [x12, #0x90]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr q6, [x12, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" + "ldr q7, [x12, #0xb0]\n" + "ldr q6, [x12, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" + "ldr 
q7, [x12, #0xd0]\n" + "ldr q6, [x12, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + "ldr q7, [x12, #0xf0]\n" + "add x12, x12, #0x100\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + "117:" // Height 5: Multiply loop: Main loop skip + "cbz x10, 122f\n" + "cmp x10, #0x4\n" + "blt 119f\n" + "118:" // Height 5: Multiply loop: Odd block loop + "ldr s0, [x9], #0x4\n" + "sub x10, x10, #0x4\n" + "ldr s1, [x26], #0x4\n" + "cmp x10, #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr s3, [x23], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr q6, [x12, #0x0]\n" + "ldr q7, [x12, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q6, [x12, #0x20]\n" + "ldr q7, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + ".inst 0x4f80e0ca // sdot v10.4s, 
v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + "bge 118b\n" + "cbz x10, 122f\n" + "119:" // Height 5: Multiply loop: Skip odd blocks + "tbz x10, #1, 120f\n" + "ldr h0, [x9], #0x2\n" + "ldr h1, [x26], #0x2\n" + "ldr h2, [x24], #0x2\n" + "ldr h3, [x23], #0x2\n" + "ldr h4, [x22], #0x2\n" + "tbz x10, #0, 121f\n" + "ld1 { v0.b }[2], [x9]\n" + "ld1 { v1.b }[2], [x26]\n" + "ld1 { v2.b }[2], [x24]\n" + "ld1 { v3.b }[2], [x23]\n" + "ld1 { v4.b }[2], [x22]\n" + "b 121f\n" + "120:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x9, #0x0]\n" + "ldr b1, [x26, #0x0]\n" + "ldr b2, [x24, #0x0]\n" + "ldr b3, [x23, #0x0]\n" + "ldr b4, [x22, #0x0]\n" + "121:" // Height 5: Multiply loop: Ragged operand read: Done + "ldr q6, [x12, #0x0]\n" + "ldr q7, [x12, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q6, [x12, #0x20]\n" + "ldr q7, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, 
v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + "122:" // Height 5: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x11, x11, #0x1\n" + "cmp x11, x19\n" + "bne 112b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "ldr q0, [x15, #0x0]\n" + "ldr q1, [x15, #0x10]\n" + "add x23, x13, x19\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19\n" + "add v8.4s, v8.4s, v0.4s\n" + "add v12.4s, v12.4s, v0.4s\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v13.4s, v13.4s, v1.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19\n" + "prfm pstl1keep, [x21, #0x0]\n" + "add x20, x21, x19\n" + "prfm pstl1keep, [x20, #0x0]\n" + "ldr q2, [x15, #0x20]\n" + "ldr q3, [x15, #0x30]\n" + "add x15, x15, #0x40\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v14.4s, v14.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v15.4s, v15.4s, v3.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "tbz %x[flags], #4, 123f\n" + "ldr q0, [x16, #0x0]\n" + "ldr q4, [x17, #0x0]\n" + "ldr q1, [x16, #0x10]\n" + "ldr q5, [x17, #0x10]\n" + "ldr q2, [x16, #0x20]\n" + "ldr q6, [x17, #0x20]\n" + "ldr q3, [x16, #0x30]\n" + "add x16, x16, #0x40\n" + "ldr q7, [x17, #0x30]\n" + "add 
x17, x17, #0x40\n" + "b 124f\n" + "123:" // Height 5: per layer parameters + "add x25, %x[qp], %[per_layer_right_shift]\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1r { v0.4s }, [x25]\n" + "ld1r { v4.4s }, [x24]\n" + "mov v1.16b, v0.16b\n" + "mov v2.16b, v0.16b\n" + "mov v5.16b, v4.16b\n" + "mov v6.16b, v4.16b\n" + "mov v3.16b, v0.16b\n" + "mov v7.16b, v4.16b\n" + "124:" // Height 5: parameters loaded + "sqrdmulh v8.4s, v8.4s, v4.4s\n" + "sqrdmulh v9.4s, v9.4s, v5.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "sqrdmulh v11.4s, v11.4s, v7.4s\n" + "sqrdmulh v12.4s, v12.4s, v4.4s\n" + "sqrdmulh v13.4s, v13.4s, v5.4s\n" + "sqrdmulh v14.4s, v14.4s, v6.4s\n" + "sqrdmulh v15.4s, v15.4s, v7.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v5.4s\n" + "sqrdmulh v18.4s, v18.4s, v6.4s\n" + "sqrdmulh v19.4s, v19.4s, v7.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v5.4s\n" + "sqrdmulh v22.4s, v22.4s, v6.4s\n" + "sqrdmulh v23.4s, v23.4s, v7.4s\n" + "sqrdmulh v24.4s, v24.4s, v4.4s\n" + "sqrdmulh v25.4s, v25.4s, v5.4s\n" + "sqrdmulh v26.4s, v26.4s, v6.4s\n" + "sqrdmulh v27.4s, v27.4s, v7.4s\n" + "tbz %x[flags], #5, 125f\n" + "and v4.16b, v8.16b, v0.16b\n" + "and v5.16b, v9.16b, v1.16b\n" + "and v6.16b, v10.16b, v2.16b\n" + "and v7.16b, v11.16b, v3.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v4.4s\n" + "and v4.16b, v12.16b, v0.16b\n" + "sqadd v9.4s, v9.4s, v5.4s\n" + "sqadd v10.4s, v10.4s, v6.4s\n" + "sqadd v11.4s, v11.4s, v7.4s\n" + "and v5.16b, v13.16b, v1.16b\n" + "and v6.16b, v14.16b, v2.16b\n" + "and v7.16b, v15.16b, v3.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v12.4s, v12.4s, v4.4s\n" + "and v4.16b, v16.16b, v0.16b\n" + "sqadd v13.4s, v13.4s, v5.4s\n" + "and v5.16b, v17.16b, v1.16b\n" + "sqadd v14.4s, v14.4s, v6.4s\n" + 
"sqadd v15.4s, v15.4s, v7.4s\n" + "and v6.16b, v18.16b, v2.16b\n" + "and v7.16b, v19.16b, v3.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "and v4.16b, v20.16b, v0.16b\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "and v5.16b, v21.16b, v1.16b\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v6.16b, v22.16b, v2.16b\n" + "and v7.16b, v23.16b, v3.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v20.4s, v20.4s, v4.4s\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v4.16b, v24.16b, v0.16b\n" + "sqadd v21.4s, v21.4s, v5.4s\n" + "and v5.16b, v25.16b, v1.16b\n" + "sqadd v22.4s, v22.4s, v6.4s\n" + "and v6.16b, v26.16b, v2.16b\n" + "sqadd v23.4s, v23.4s, v7.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v27.16b, v3.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v24.4s, v24.4s, v4.4s\n" + "sqadd v25.4s, v25.4s, v5.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v26.4s, v26.4s, v6.4s\n" + "sqadd v27.4s, v27.4s, v7.4s\n" + "125:" // Height 5: no shift correction + "srshl v8.4s, v8.4s, v0.4s\n" + "srshl v9.4s, v9.4s, v1.4s\n" + "srshl v10.4s, v10.4s, v2.4s\n" + "srshl v11.4s, v11.4s, v3.4s\n" + "srshl v12.4s, v12.4s, v0.4s\n" + "srshl v13.4s, v13.4s, v1.4s\n" + "srshl v14.4s, v14.4s, v2.4s\n" + "srshl v15.4s, v15.4s, v3.4s\n" + "srshl v16.4s, v16.4s, v0.4s\n" + "srshl v17.4s, v17.4s, v1.4s\n" + "srshl v18.4s, v18.4s, v2.4s\n" + "srshl v19.4s, v19.4s, v3.4s\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "srshl v21.4s, v21.4s, v1.4s\n" + "srshl v22.4s, v22.4s, v2.4s\n" + "srshl v23.4s, v23.4s, v3.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" + "srshl v25.4s, v25.4s, v1.4s\n" + "srshl v26.4s, v26.4s, v2.4s\n" + "srshl v27.4s, v27.4s, v3.4s\n" + "add x24, %x[qp], %[c_offset]\n" + "add x25, %x[qp], %[minval]\n" + "ld1r { v4.4s }, [x24]\n" + "add x24, %x[qp], %[maxval]\n" 
+ "ld1r { v5.4s }, [x25]\n" + "cmp x14, #0x10\n" + "ld1r { v6.4s }, [x24]\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax 
v27.4s, v27.4s, v5.4s\n" + "uzp1 v8.8h, v8.8h, v9.8h\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v12.8h, v12.8h, v13.8h\n" + "uzp1 v13.8h, v14.8h, v15.8h\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v20.8h, v20.8h, v21.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v24.8h, v24.8h, v25.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v12.16b, v12.16b, v13.16b\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 134f\n" + "tbz x14, #3, 129f\n" + "str d8, [x13], #0x8\n" + "str d12, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "str d20, [x21], #0x8\n" + "str d24, [x20], #0x8\n" + "tbz x14, #2, 127f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "st1 { v12.s }[2], [x23], #0x4\n" + "st1 { v16.s }[2], [x22], #0x4\n" + "st1 { v20.s }[2], [x21], #0x4\n" + "st1 { v24.s }[2], [x20], #0x4\n" + "tbz x14, #1, 126f\n" + "st1 { v8.h }[6], [x13], #0x2\n" + "st1 { v12.h }[6], [x23], #0x2\n" + "st1 { v16.h }[6], [x22], #0x2\n" + "st1 { v20.h }[6], [x21], #0x2\n" + "st1 { v24.h }[6], [x20], #0x2\n" + "tbz x14, #0, 133f\n" + "st1 { v8.b }[14], [x13]\n" + "st1 { v12.b }[14], [x23]\n" + "st1 { v16.b }[14], [x22]\n" + "st1 { v20.b }[14], [x21]\n" + "st1 { v24.b }[14], [x20]\n" + "b 133f\n" + "126:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x14, #0, 133f\n" + "st1 { v8.b }[12], [x13]\n" + "st1 { v12.b }[12], [x23]\n" + "st1 { v16.b }[12], [x22]\n" + "st1 { v20.b }[12], [x21]\n" + "st1 { v24.b }[12], [x20]\n" + "b 133f\n" + "127:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x14, #1, 128f\n" + "st1 { v8.h }[4], [x13], #0x2\n" + "st1 { v12.h }[4], [x23], #0x2\n" + "st1 { v16.h }[4], [x22], #0x2\n" + "st1 { v20.h }[4], [x21], #0x2\n" + "st1 { v24.h }[4], [x20], #0x2\n" + "tbz x14, #0, 133f\n" + "st1 { v8.b }[10], [x13]\n" + "st1 { v12.b }[10], [x23]\n" + "st1 { v16.b }[10], [x22]\n" + "st1 { v20.b }[10], [x21]\n" + "st1 { v24.b }[10], 
[x20]\n" + "b 133f\n" + "128:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x14, #0, 133f\n" + "st1 { v8.b }[8], [x13]\n" + "st1 { v12.b }[8], [x23]\n" + "st1 { v16.b }[8], [x22]\n" + "st1 { v20.b }[8], [x21]\n" + "st1 { v24.b }[8], [x20]\n" + "b 133f\n" + "129:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x14, #2, 131f\n" + "str s8, [x13], #0x4\n" + "str s12, [x23], #0x4\n" + "str s16, [x22], #0x4\n" + "str s20, [x21], #0x4\n" + "str s24, [x20], #0x4\n" + "tbz x14, #1, 130f\n" + "st1 { v8.h }[2], [x13], #0x2\n" + "st1 { v12.h }[2], [x23], #0x2\n" + "st1 { v16.h }[2], [x22], #0x2\n" + "st1 { v20.h }[2], [x21], #0x2\n" + "st1 { v24.h }[2], [x20], #0x2\n" + "tbz x14, #0, 133f\n" + "st1 { v8.b }[6], [x13]\n" + "st1 { v12.b }[6], [x23]\n" + "st1 { v16.b }[6], [x22]\n" + "st1 { v20.b }[6], [x21]\n" + "st1 { v24.b }[6], [x20]\n" + "b 133f\n" + "130:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x14, #0, 133f\n" + "st1 { v8.b }[4], [x13]\n" + "st1 { v12.b }[4], [x23]\n" + "st1 { v16.b }[4], [x22]\n" + "st1 { v20.b }[4], [x21]\n" + "st1 { v24.b }[4], [x20]\n" + "b 133f\n" + "131:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x14, #1, 132f\n" + "str h8, [x13], #0x2\n" + "str h12, [x23], #0x2\n" + "str h16, [x22], #0x2\n" + "str h20, [x21], #0x2\n" + "str h24, [x20], #0x2\n" + "tbz x14, #0, 133f\n" + "st1 { v8.b }[2], [x13]\n" + "st1 { v12.b }[2], [x23]\n" + "st1 { v16.b }[2], [x22]\n" + "st1 { v20.b }[2], [x21]\n" + "st1 { v24.b }[2], [x20]\n" + "b 133f\n" + "132:" // Height 5: Partial direct writeback: partial_1_0 + "str b8, [x13, #0x0]\n" + "str b12, [x23, #0x0]\n" + "str b16, [x22, #0x0]\n" + "str b20, [x21, #0x0]\n" + "str b24, [x20, #0x0]\n" + "133:" // Height 5: Partial direct writeback: Done + "b 135f\n" + "134:" // Height 5: Full writeback + "str q8, [x13, #0x0]\n" + "add x13, x13, #0x10\n" + "str q12, [x23, #0x0]\n" + "str q16, [x22, #0x0]\n" + "str q20, [x21, #0x0]\n" + "str q24, [x20, #0x0]\n" + "135:" // 
Height 5: Writeback done + "subs x14, x14, #0x10\n" + "bgt 110b\n" + "b 164f\n" + "136:" // Height 6 + "ldr x17, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x15, %x[col_bias]\n" + "ldr x16, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x13, %x[output_ptr]\n" + "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" + "mov x20, #0x6\n" + "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "madd %x[output_ptr], x19, x20, %x[output_ptr]\n" + "137:" // Height 6: Column loop + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "138:" // Height 6: setup done + "mov x11, #0x0\n" + "139:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w10, [x20, x11, LSL #0x2]\n" + "tbz %x[flags], #3, 140f\n" + "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x9, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x11, 141f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x9, x9, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x20, x20, x19\n" + "b 141f\n" + "140:" // Height 6: setup direct input + "mov x9, %x[input_ptr]\n" + "add x26, x9, x19\n" + "add x24, x26, x19\n" + "add x23, x24, x19\n" + "add x22, 
x23, x19\n" + "add x20, x22, x19\n" + "141:" // Height 6: input setup done + "cmp x10, #0x10\n" + "blt 144f\n" + "ldr q0, [x9, #0x0]\n" + "ldr q1, [x26, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q2, [x24, #0x0]\n" + "ldr q3, [x23, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x12, #0x0]\n" + "blt 143f\n" + "142:" // Height 6: Multiply loop: Main loop head + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + "ldr d7, [x12, #0x10]\n" + "add x9, x9, #0x10\n" + "ldr x19, [x12, #0x18]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x9, #0x80]\n" + "add x24, x24, #0x10\n" + "ldr d6, [x12, #0x20]\n" + "add x23, x23, #0x10\n" + "mov v7.d[1], x19\n" + "ldr x28, [x12, #0x28]\n" + "ldr x19, [x12, #0x38]\n" + "add x22, x22, #0x10\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "mov v6.d[1], x28\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" + "ldr d7, [x12, #0x30]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x20, x20, #0x10\n" + "ldr d6, [x12, #0x40]\n" + "sub x10, x10, #0x10\n" + "mov v7.d[1], x19\n" + "ldr x28, [x12, #0x48]\n" + "ldr x19, [x12, #0x58]\n" + "cmp x10, #0x20\n" + ".inst 0x4f80e0eb // 
sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "mov v6.d[1], x28\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" + ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n" + "ldr d7, [x12, #0x50]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr d6, [x12, #0x60]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x12, #0x68]\n" + "ldr x19, [x12, #0x78]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + "mov v6.d[1], x28\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" + ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n" + "ldr d7, [x12, #0x70]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "ldr d6, [x12, #0x80]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x12, #0x88]\n" + "ldr x19, [x12, #0x98]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + "mov v6.d[1], x28\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, 
v4.4b[1]\n" + ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" + ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n" + "ldr d7, [x12, #0x90]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr d6, [x12, #0xa0]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x12, #0xa8]\n" + "ldr x19, [x12, #0xb8]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + "mov v6.d[1], x28\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" + ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n" + "ldr d7, [x12, #0xb0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "ldr d6, [x12, #0xc0]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x12, #0xc8]\n" + "ldr x19, [x12, #0xd8]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + "mov v6.d[1], x28\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, 
v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n" + "ldr d7, [x12, #0xd0]\n" + "ldr d6, [x12, #0xe0]\n" + "ldr x28, [x12, #0xe8]\n" + "mov v7.d[1], x19\n" + "ldr x19, [x12, #0xf8]\n" + "ldr x27, [x9, #0x8]\n" + "mov v6.d[1], x28\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n" + "ldr d7, [x12, #0xf0]\n" + "ldr x25, [x26, #0x8]\n" + "add x12, x12, #0x100\n" + "ldr d6, [x12, #0x0]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x12, #0x8]\n" + "ldr x21, [x24, #0x8]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + "mov v6.d[1], x28\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n" + "ldr d0, [x9, #0x0]\n" + "ldr d1, [x26, #0x0]\n" + "ldr d2, [x24, #0x0]\n" + "mov v0.d[1], x27\n" + "ldr d3, [x23, #0x0]\n" + "mov v1.d[1], x25\n" + "ldr x19, [x23, #0x8]\n" + "mov v2.d[1], x21\n" + "ldr d4, [x22, #0x0]\n" + "ldr x21, [x22, #0x8]\n" + "mov v3.d[1], x19\n" + "ldr d5, [x20, #0x0]\n" + "ldr x19, [x20, #0x8]\n" + "mov v4.d[1], x21\n" + "mov v5.d[1], x19\n" + "bge 142b\n" + "143:" // Height 6: Multiply loop: Single iteration only + ".inst 0x4f80e0c8 // 
sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + "ldr q7, [x12, #0x10]\n" + "sub x10, x10, #0x10\n" + "ldr q6, [x12, #0x20]\n" + "add x9, x9, #0x10\n" + "prfm pldl1keep, [x9, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" + "ldr q7, [x12, #0x30]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x23, x23, #0x10\n" + "ldr q6, [x12, #0x40]\n" + "add x22, x22, #0x10\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" + ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n" + "ldr q7, 
[x12, #0x50]\n" + "add x20, x20, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x12, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" + ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n" + "ldr q7, [x12, #0x70]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "ldr q6, [x12, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" + ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n" + "ldr q7, [x12, #0x90]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr q6, [x12, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n" + ".inst 0x4f80e8ca // sdot v10.4s, 
v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" + ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n" + "ldr q7, [x12, #0xb0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "ldr q6, [x12, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n" + "ldr q7, [x12, #0xd0]\n" + "ldr q6, [x12, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n" + "ldr q7, [x12, #0xf0]\n" + "add x12, x12, #0x100\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, 
v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n" + "144:" // Height 6: Multiply loop: Main loop skip + "cbz x10, 149f\n" + "cmp x10, #0x4\n" + "blt 146f\n" + "145:" // Height 6: Multiply loop: Odd block loop + "ldr s0, [x9], #0x4\n" + "sub x10, x10, #0x4\n" + "ldr s1, [x26], #0x4\n" + "cmp x10, #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr s3, [x23], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr s5, [x20], #0x4\n" + "ldr q6, [x12, #0x0]\n" + "ldr q7, [x12, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q6, [x12, #0x20]\n" + "ldr q7, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0ff // sdot v31.4s, 
v7.16b, v5.4b[0]\n" + "bge 145b\n" + "cbz x10, 149f\n" + "146:" // Height 6: Multiply loop: Skip odd blocks + "tbz x10, #1, 147f\n" + "ldr h0, [x9], #0x2\n" + "ldr h1, [x26], #0x2\n" + "ldr h2, [x24], #0x2\n" + "ldr h3, [x23], #0x2\n" + "ldr h4, [x22], #0x2\n" + "ldr h5, [x20], #0x2\n" + "tbz x10, #0, 148f\n" + "ld1 { v0.b }[2], [x9]\n" + "ld1 { v1.b }[2], [x26]\n" + "ld1 { v2.b }[2], [x24]\n" + "ld1 { v3.b }[2], [x23]\n" + "ld1 { v4.b }[2], [x22]\n" + "ld1 { v5.b }[2], [x20]\n" + "b 148f\n" + "147:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x9, #0x0]\n" + "ldr b1, [x26, #0x0]\n" + "ldr b2, [x24, #0x0]\n" + "ldr b3, [x23, #0x0]\n" + "ldr b4, [x22, #0x0]\n" + "ldr b5, [x20, #0x0]\n" + "148:" // Height 6: Multiply loop: Ragged operand read: Done + "ldr q6, [x12, #0x0]\n" + "ldr q7, [x12, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q6, [x12, #0x20]\n" + "ldr q7, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef 
// sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + "149:" // Height 6: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x11, x11, #0x1\n" + "cmp x11, x19\n" + "bne 139b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "ldr q0, [x15, #0x0]\n" + "ldr q1, [x15, #0x10]\n" + "add x23, x13, x19\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19\n" + "add v8.4s, v8.4s, v0.4s\n" + "add v12.4s, v12.4s, v0.4s\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v13.4s, v13.4s, v1.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v28.4s, v28.4s, v0.4s\n" + "add v29.4s, v29.4s, v1.4s\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19\n" + "prfm pstl1keep, [x21, #0x0]\n" + "add x20, x21, x19\n" + "prfm pstl1keep, [x20, #0x0]\n" + "add x19, x20, x19\n" + "prfm pstl1keep, [x19, #0x0]\n" + "ldr q2, [x15, #0x20]\n" + "ldr q3, [x15, #0x30]\n" + "add x15, x15, #0x40\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v14.4s, v14.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v15.4s, v15.4s, v3.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "add v30.4s, v30.4s, v2.4s\n" + "add v31.4s, v31.4s, v3.4s\n" + "tbz %x[flags], #4, 150f\n" + "ldr q0, [x16, #0x0]\n" + "ldr q4, [x17, #0x0]\n" + "ldr q1, [x16, #0x10]\n" + "ldr q5, [x17, #0x10]\n" + "ldr q2, [x16, #0x20]\n" + "ldr q6, [x17, #0x20]\n" + "ldr q3, [x16, #0x30]\n" + "add x16, x16, #0x40\n" + "ldr q7, [x17, #0x30]\n" + "add x17, x17, 
#0x40\n" + "b 151f\n" + "150:" // Height 6: per layer parameters + "add x25, %x[qp], %[per_layer_right_shift]\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1r { v0.4s }, [x25]\n" + "ld1r { v4.4s }, [x24]\n" + "mov v1.16b, v0.16b\n" + "mov v2.16b, v0.16b\n" + "mov v5.16b, v4.16b\n" + "mov v6.16b, v4.16b\n" + "mov v3.16b, v0.16b\n" + "mov v7.16b, v4.16b\n" + "151:" // Height 6: parameters loaded + "sqrdmulh v8.4s, v8.4s, v4.4s\n" + "sqrdmulh v9.4s, v9.4s, v5.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "sqrdmulh v11.4s, v11.4s, v7.4s\n" + "sqrdmulh v12.4s, v12.4s, v4.4s\n" + "sqrdmulh v13.4s, v13.4s, v5.4s\n" + "sqrdmulh v14.4s, v14.4s, v6.4s\n" + "sqrdmulh v15.4s, v15.4s, v7.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v5.4s\n" + "sqrdmulh v18.4s, v18.4s, v6.4s\n" + "sqrdmulh v19.4s, v19.4s, v7.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v5.4s\n" + "sqrdmulh v22.4s, v22.4s, v6.4s\n" + "sqrdmulh v23.4s, v23.4s, v7.4s\n" + "sqrdmulh v24.4s, v24.4s, v4.4s\n" + "sqrdmulh v25.4s, v25.4s, v5.4s\n" + "sqrdmulh v26.4s, v26.4s, v6.4s\n" + "sqrdmulh v27.4s, v27.4s, v7.4s\n" + "sqrdmulh v28.4s, v28.4s, v4.4s\n" + "sqrdmulh v29.4s, v29.4s, v5.4s\n" + "sqrdmulh v30.4s, v30.4s, v6.4s\n" + "sqrdmulh v31.4s, v31.4s, v7.4s\n" + "tbz %x[flags], #5, 152f\n" + "and v4.16b, v8.16b, v0.16b\n" + "and v5.16b, v9.16b, v1.16b\n" + "and v6.16b, v10.16b, v2.16b\n" + "and v7.16b, v11.16b, v3.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v4.4s\n" + "and v4.16b, v12.16b, v0.16b\n" + "sqadd v9.4s, v9.4s, v5.4s\n" + "sqadd v10.4s, v10.4s, v6.4s\n" + "sqadd v11.4s, v11.4s, v7.4s\n" + "and v5.16b, v13.16b, v1.16b\n" + "and v6.16b, v14.16b, v2.16b\n" + "and v7.16b, v15.16b, v3.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v12.4s, v12.4s, v4.4s\n" + 
"and v4.16b, v16.16b, v0.16b\n" + "sqadd v13.4s, v13.4s, v5.4s\n" + "and v5.16b, v17.16b, v1.16b\n" + "sqadd v14.4s, v14.4s, v6.4s\n" + "sqadd v15.4s, v15.4s, v7.4s\n" + "and v6.16b, v18.16b, v2.16b\n" + "and v7.16b, v19.16b, v3.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "and v4.16b, v20.16b, v0.16b\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "and v5.16b, v21.16b, v1.16b\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v6.16b, v22.16b, v2.16b\n" + "and v7.16b, v23.16b, v3.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v20.4s, v20.4s, v4.4s\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v4.16b, v24.16b, v0.16b\n" + "sqadd v21.4s, v21.4s, v5.4s\n" + "and v5.16b, v25.16b, v1.16b\n" + "sqadd v22.4s, v22.4s, v6.4s\n" + "and v6.16b, v26.16b, v2.16b\n" + "sqadd v23.4s, v23.4s, v7.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v27.16b, v3.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v24.4s, v24.4s, v4.4s\n" + "and v4.16b, v28.16b, v0.16b\n" + "sqadd v25.4s, v25.4s, v5.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v5.16b, v29.16b, v1.16b\n" + "sqadd v26.4s, v26.4s, v6.4s\n" + "and v6.16b, v30.16b, v2.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v27.4s, v27.4s, v7.4s\n" + "and v7.16b, v31.16b, v3.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v28.4s, v28.4s, v4.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v29.4s, v29.4s, v5.4s\n" + "sqadd v30.4s, v30.4s, v6.4s\n" + "sqadd v31.4s, v31.4s, v7.4s\n" + "152:" // Height 6: no shift correction + "srshl v8.4s, v8.4s, v0.4s\n" + "srshl v9.4s, v9.4s, v1.4s\n" + "srshl v10.4s, v10.4s, v2.4s\n" + "srshl v11.4s, v11.4s, v3.4s\n" + "srshl v12.4s, v12.4s, v0.4s\n" + "srshl v13.4s, v13.4s, v1.4s\n" + "srshl v14.4s, v14.4s, v2.4s\n" + "srshl v15.4s, v15.4s, v3.4s\n" + 
"srshl v16.4s, v16.4s, v0.4s\n" + "srshl v17.4s, v17.4s, v1.4s\n" + "srshl v18.4s, v18.4s, v2.4s\n" + "srshl v19.4s, v19.4s, v3.4s\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "srshl v21.4s, v21.4s, v1.4s\n" + "srshl v22.4s, v22.4s, v2.4s\n" + "srshl v23.4s, v23.4s, v3.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" + "srshl v25.4s, v25.4s, v1.4s\n" + "srshl v26.4s, v26.4s, v2.4s\n" + "srshl v27.4s, v27.4s, v3.4s\n" + "srshl v28.4s, v28.4s, v0.4s\n" + "srshl v29.4s, v29.4s, v1.4s\n" + "srshl v30.4s, v30.4s, v2.4s\n" + "srshl v31.4s, v31.4s, v3.4s\n" + "add x24, %x[qp], %[c_offset]\n" + "add x25, %x[qp], %[minval]\n" + "ld1r { v4.4s }, [x24]\n" + "add x24, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x25]\n" + "cmp x14, #0x10\n" + "ld1r { v6.4s }, [x24]\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, 
v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" + "add v28.4s, v28.4s, v4.4s\n" + "add v29.4s, v29.4s, v4.4s\n" + "add v30.4s, v30.4s, v4.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "uzp1 v8.8h, v8.8h, v9.8h\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v12.8h, v12.8h, v13.8h\n" + "uzp1 v13.8h, v14.8h, v15.8h\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "smin v28.4s, v28.4s, v6.4s\n" + "smin v29.4s, v29.4s, v6.4s\n" + "smin v30.4s, v30.4s, v6.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" + "uzp1 v20.8h, v20.8h, v21.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v24.8h, v24.8h, v25.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v12.16b, v12.16b, v13.16b\n" + "smax v28.4s, v28.4s, v5.4s\n" + "smax v29.4s, v29.4s, v5.4s\n" + "smax v30.4s, v30.4s, v5.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "uzp1 v28.8h, v28.8h, v29.8h\n" + "uzp1 v29.8h, v30.8h, v31.8h\n" + "uzp1 v28.16b, v28.16b, v29.16b\n" + "bge 161f\n" + "tbz x14, #3, 156f\n" + "str d8, [x13], #0x8\n" + "str d12, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "str d20, [x21], #0x8\n" + "str d24, [x20], #0x8\n" + "str d28, [x19], #0x8\n" + "tbz x14, #2, 154f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "st1 { v12.s }[2], 
[x23], #0x4\n" + "st1 { v16.s }[2], [x22], #0x4\n" + "st1 { v20.s }[2], [x21], #0x4\n" + "st1 { v24.s }[2], [x20], #0x4\n" + "st1 { v28.s }[2], [x19], #0x4\n" + "tbz x14, #1, 153f\n" + "st1 { v8.h }[6], [x13], #0x2\n" + "st1 { v12.h }[6], [x23], #0x2\n" + "st1 { v16.h }[6], [x22], #0x2\n" + "st1 { v20.h }[6], [x21], #0x2\n" + "st1 { v24.h }[6], [x20], #0x2\n" + "st1 { v28.h }[6], [x19], #0x2\n" + "tbz x14, #0, 160f\n" + "st1 { v8.b }[14], [x13]\n" + "st1 { v12.b }[14], [x23]\n" + "st1 { v16.b }[14], [x22]\n" + "st1 { v20.b }[14], [x21]\n" + "st1 { v24.b }[14], [x20]\n" + "st1 { v28.b }[14], [x19]\n" + "b 160f\n" + "153:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x14, #0, 160f\n" + "st1 { v8.b }[12], [x13]\n" + "st1 { v12.b }[12], [x23]\n" + "st1 { v16.b }[12], [x22]\n" + "st1 { v20.b }[12], [x21]\n" + "st1 { v24.b }[12], [x20]\n" + "st1 { v28.b }[12], [x19]\n" + "b 160f\n" + "154:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x14, #1, 155f\n" + "st1 { v8.h }[4], [x13], #0x2\n" + "st1 { v12.h }[4], [x23], #0x2\n" + "st1 { v16.h }[4], [x22], #0x2\n" + "st1 { v20.h }[4], [x21], #0x2\n" + "st1 { v24.h }[4], [x20], #0x2\n" + "st1 { v28.h }[4], [x19], #0x2\n" + "tbz x14, #0, 160f\n" + "st1 { v8.b }[10], [x13]\n" + "st1 { v12.b }[10], [x23]\n" + "st1 { v16.b }[10], [x22]\n" + "st1 { v20.b }[10], [x21]\n" + "st1 { v24.b }[10], [x20]\n" + "st1 { v28.b }[10], [x19]\n" + "b 160f\n" + "155:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x14, #0, 160f\n" + "st1 { v8.b }[8], [x13]\n" + "st1 { v12.b }[8], [x23]\n" + "st1 { v16.b }[8], [x22]\n" + "st1 { v20.b }[8], [x21]\n" + "st1 { v24.b }[8], [x20]\n" + "st1 { v28.b }[8], [x19]\n" + "b 160f\n" + "156:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x14, #2, 158f\n" + "str s8, [x13], #0x4\n" + "str s12, [x23], #0x4\n" + "str s16, [x22], #0x4\n" + "str s20, [x21], #0x4\n" + "str s24, [x20], #0x4\n" + "str s28, [x19], #0x4\n" + "tbz x14, #1, 157f\n" + "st1 { v8.h }[2], [x13], 
#0x2\n" + "st1 { v12.h }[2], [x23], #0x2\n" + "st1 { v16.h }[2], [x22], #0x2\n" + "st1 { v20.h }[2], [x21], #0x2\n" + "st1 { v24.h }[2], [x20], #0x2\n" + "st1 { v28.h }[2], [x19], #0x2\n" + "tbz x14, #0, 160f\n" + "st1 { v8.b }[6], [x13]\n" + "st1 { v12.b }[6], [x23]\n" + "st1 { v16.b }[6], [x22]\n" + "st1 { v20.b }[6], [x21]\n" + "st1 { v24.b }[6], [x20]\n" + "st1 { v28.b }[6], [x19]\n" + "b 160f\n" + "157:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x14, #0, 160f\n" + "st1 { v8.b }[4], [x13]\n" + "st1 { v12.b }[4], [x23]\n" + "st1 { v16.b }[4], [x22]\n" + "st1 { v20.b }[4], [x21]\n" + "st1 { v24.b }[4], [x20]\n" + "st1 { v28.b }[4], [x19]\n" + "b 160f\n" + "158:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x14, #1, 159f\n" + "str h8, [x13], #0x2\n" + "str h12, [x23], #0x2\n" + "str h16, [x22], #0x2\n" + "str h20, [x21], #0x2\n" + "str h24, [x20], #0x2\n" + "str h28, [x19], #0x2\n" + "tbz x14, #0, 160f\n" + "st1 { v8.b }[2], [x13]\n" + "st1 { v12.b }[2], [x23]\n" + "st1 { v16.b }[2], [x22]\n" + "st1 { v20.b }[2], [x21]\n" + "st1 { v24.b }[2], [x20]\n" + "st1 { v28.b }[2], [x19]\n" + "b 160f\n" + "159:" // Height 6: Partial direct writeback: partial_1_0 + "str b8, [x13, #0x0]\n" + "str b12, [x23, #0x0]\n" + "str b16, [x22, #0x0]\n" + "str b20, [x21, #0x0]\n" + "str b24, [x20, #0x0]\n" + "str b28, [x19, #0x0]\n" + "160:" // Height 6: Partial direct writeback: Done + "b 162f\n" + "161:" // Height 6: Full writeback + "str q8, [x13, #0x0]\n" + "add x13, x13, #0x10\n" + "str q12, [x23, #0x0]\n" + "str q16, [x22, #0x0]\n" + "str q20, [x21, #0x0]\n" + "str q24, [x20, #0x0]\n" + "str q28, [x19, #0x0]\n" + "162:" // Height 6: Writeback done + "subs x14, x14, #0x10\n" + "bgt 137b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 164f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 163f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "163:" // Update direct input + "mov x19, 
#0x6\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "164:" // Exit + + : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp index 9847e6553b..5a4df161aa 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp @@ -88,210 +88,204 @@ void a64_hybrid_s8qs_dot_6x16 ( "1:" // Row loop "cmp %x[M], #0x6\n" - "bge 
141f\n" + "bge 136f\n" "cmp %x[M], #0x4\n" - "bgt 113f\n" - "beq 85f\n" + "bgt 109f\n" + "beq 82f\n" "cmp %x[M], #0x2\n" - "bgt 57f\n" - "beq 29f\n" - "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" - "mov x16, %x[col_bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 2f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19\n" - "b 3f\n" - "2:" // Height 1: setup direct output - "mov x13, %x[output_ptr]\n" - "3:" // Height 1: Column loop + "bgt 55f\n" + "beq 28f\n" + "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x11, %x[col_bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[output_ptr]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "2:" // Height 1: Column loop "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" "movi v11.4s, #0x0\n" - "4:" // Height 1: setup done - "mov x12, #0x0\n" - "5:" // Height 1: String loop + "3:" // Height 1: setup done + "mov x27, #0x0\n" + "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 6f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 5f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "cbnz x12, 7f\n" + "ldr x25, [x20, #0x0]\n" + "cbnz x27, 6f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "b 7f\n" - "6:" // Height 1: setup direct input - "mov x10, %x[input_ptr]\n" - "7:" // Height 1: input setup done - "cmp x11, #0x10\n" - "blt 10f\n" - "cmp x11, #0x20\n" + "add x25, x25, x19\n" + "b 6f\n" + "5:" // Height 1: setup direct input + 
"mov x25, %x[input_ptr]\n" + "6:" // Height 1: input setup done + "cmp x26, #0x10\n" "blt 9f\n" - "8:" // Height 1: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "ldr q0, [x25, #0x0]\n" + "ldr q6, [x28, #0x0]\n" + "cmp x26, #0x20\n" + "blt 8f\n" + "7:" // Height 1: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q7, [x28, #0x10]\n" + "add x25, x25, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q6, [x28, #0x20]\n" + "sub x26, x26, #0x10\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x14, #0x40]\n" - "add x10, x10, #0x10\n" + "ldr q7, [x28, #0x30]\n" + "cmp x26, #0x20\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q6, [x28, #0x40]\n" + "ldr q7, [x28, #0x50]\n" ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x14, #0x60]\n" - "sub x11, x11, #0x10\n" + "ldr q6, [x28, #0x60]\n" ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x14, #0x70]\n" - "cmp x11, #0x20\n" + "ldr q7, [x28, #0x70]\n" + "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x28, #0x80]\n" ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x28, #0x90]\n" ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x28, #0xa0]\n" ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x28, #0xb0]\n" ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x28, #0xc0]\n" ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x28, #0xd0]\n" ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x28, 
#0xe0]\n" ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" - "add x14, x14, #0x100\n" + "ldr q7, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + "ldr q6, [x28, #0x0]\n" ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" - "bge 8b\n" - "9:" // Height 1: Multiply loop: Single iteration only - "sub x11, x11, #0x10\n" - "ldr q0, [x10, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "ldr q0, [x25, #0x0]\n" + "bge 7b\n" + "8:" // Height 1: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q7, [x28, #0x10]\n" + "sub x26, x26, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q6, [x28, #0x20]\n" + "add x25, x25, #0x10\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x14, #0x40]\n" - "add x10, x10, #0x10\n" + "ldr q7, [x28, #0x30]\n" + "ldr q6, [x28, #0x40]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x28, #0x50]\n" ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x28, #0x60]\n" + "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x28, #0x70]\n" ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x28, #0x80]\n" ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x28, #0x90]\n" ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x28, #0xa0]\n" ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x28, #0xb0]\n" ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x28, #0xc0]\n" ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - "ldr q7, 
[x14, #0xd0]\n" + "ldr q7, [x28, #0xd0]\n" ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x28, #0xe0]\n" ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" - "add x14, x14, #0x100\n" + "ldr q7, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" - "10:" // Height 1: Multiply loop: Main loop skip - "cbz x11, 15f\n" - "cmp x11, #0x4\n" - "blt 12f\n" - "11:" // Height 1: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr q6, [x14, #0x0]\n" + "9:" // Height 1: Multiply loop: Main loop skip + "cbz x26, 14f\n" + "cmp x26, #0x4\n" + "blt 11f\n" + "10:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x4\n" + "ldr q6, [x28, #0x0]\n" ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q7, [x28, #0x10]\n" + "cmp x26, #0x4\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q6, [x28, #0x20]\n" + "ldr q7, [x28, #0x30]\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "sub x11, x11, #0x4\n" - "add x14, x14, #0x40\n" + "add x28, x28, #0x40\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - "cmp x11, #0x4\n" - "bge 11b\n" - "cbz x11, 15f\n" - "12:" // Height 1: Multiply loop: Skip odd blocks - "tbz x11, #1, 13f\n" - "ldr h0, [x10], #0x2\n" - "tbz x11, #0, 14f\n" - "ld1 { v0.b }[2], [x10]\n" - "b 14f\n" - "13:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x10, #0x0]\n" - "14:" // Height 1: Multiply loop: Ragged operand read: Done - "ldr q6, [x14, #0x0]\n" + "bge 10b\n" + "cbz x26, 14f\n" + "11:" // Height 1: Multiply loop: Skip odd blocks + "tbz x26, #1, 12f\n" + "ldr h0, [x25], #0x2\n" + "tbz x26, #0, 13f\n" + "ld1 { v0.b }[2], [x25]\n" + "b 13f\n" + "12:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + 
"ldr b0, [x25, #0x0]\n" + "13:" // Height 1: Multiply loop: Ragged operand read: Done + "ldr q6, [x28, #0x0]\n" ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q7, [x28, #0x10]\n" + "ldr q6, [x28, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x28, #0x30]\n" + "add x28, x28, #0x40\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - "15:" // Height 1: Multiply loop: No odd multiplies + "14:" // Height 1: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 5b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "ldr q0, [x16, #0x0]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 4b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "ldr q0, [x11, #0x0]\n" "add v8.4s, v8.4s, v0.4s\n" - "ldr q1, [x16, #0x10]\n" - "ldr q2, [x16, #0x20]\n" + "ldr q1, [x11, #0x10]\n" + "ldr q2, [x11, #0x20]\n" "add v9.4s, v9.4s, v1.4s\n" - "ldr q3, [x16, #0x30]\n" + "ldr q3, [x11, #0x30]\n" + "add x11, x11, #0x40\n" "add v10.4s, v10.4s, v2.4s\n" - "add x16, x16, #0x40\n" "add v11.4s, v11.4s, v3.4s\n" - "tbz %x[flags], #4, 16f\n" - "ldr q0, [x17, #0x0]\n" - "ldr q4, [x8, #0x0]\n" - "ldr q1, [x17, #0x10]\n" - "ldr q5, [x8, #0x10]\n" - "ldr q2, [x17, #0x20]\n" - "ldr q6, [x8, #0x20]\n" - "ldr q3, [x17, #0x30]\n" - "ldr q7, [x8, #0x30]\n" - "add x17, x17, #0x40\n" - "add x8, x8, #0x40\n" - "b 17f\n" - "16:" // Height 1: per layer parameters - "add x19, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x19]\n" + "tbz %x[flags], #4, 15f\n" + "ldr q0, [x12, #0x0]\n" + "ldr q4, [x13, #0x0]\n" + "ldr q1, [x12, #0x10]\n" + "ldr q5, [x13, #0x10]\n" + "ldr q2, [x12, #0x20]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q3, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + "b 16f\n" + "15:" 
// Height 1: per layer parameters + "add x24, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x24]\n" "mov v1.16b, v0.16b\n" - "add x19, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x19]\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x24]\n" "mov v2.16b, v0.16b\n" "mov v3.16b, v0.16b\n" "mov v5.16b, v4.16b\n" "mov v6.16b, v4.16b\n" "mov v7.16b, v4.16b\n" - "17:" // Height 1: parameters loaded + "16:" // Height 1: parameters loaded "sqrdmulh v8.4s, v8.4s, v4.4s\n" "sqrdmulh v9.4s, v9.4s, v5.4s\n" "sqrdmulh v10.4s, v10.4s, v6.4s\n" "sqrdmulh v11.4s, v11.4s, v7.4s\n" - "tbz %x[flags], #5, 18f\n" + "tbz %x[flags], #5, 17f\n" "and v4.16b, v8.16b, v0.16b\n" "sshr v4.4s, v4.4s, #0x1f\n" "and v5.16b, v9.16b, v1.16b\n" @@ -304,18 +298,18 @@ void a64_hybrid_s8qs_dot_6x16 ( "sqadd v9.4s, v9.4s, v5.4s\n" "sqadd v10.4s, v10.4s, v6.4s\n" "sqadd v11.4s, v11.4s, v7.4s\n" - "18:" // Height 1: no shift correction + "17:" // Height 1: no shift correction "srshl v8.4s, v8.4s, v0.4s\n" - "add x19, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x19]\n" + "add x24, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x24]\n" "srshl v9.4s, v9.4s, v1.4s\n" - "add x19, %x[qp], %[minval]\n" + "add x24, %x[qp], %[minval]\n" "srshl v10.4s, v10.4s, v2.4s\n" - "ld1r { v5.4s }, [x19]\n" - "add x19, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x24]\n" + "add x24, %x[qp], %[maxval]\n" "srshl v11.4s, v11.4s, v3.4s\n" - "ld1r { v6.4s }, [x19]\n" - "cmp x15, #0x10\n" + "ld1r { v6.4s }, [x24]\n" + "cmp x10, #0x10\n" "add v8.4s, v8.4s, v4.4s\n" "add v9.4s, v9.4s, v4.4s\n" "add v10.4s, v10.4s, v4.4s\n" @@ -331,76 +325,67 @@ void a64_hybrid_s8qs_dot_6x16 ( "smax v11.4s, v11.4s, v5.4s\n" "uzp1 v9.8h, v10.8h, v11.8h\n" "uzp1 v8.16b, v8.16b, v9.16b\n" - "bge 27f\n" - "tbz x15, #3, 22f\n" - "str d8, [x13], #0x8\n" - "tbz x15, #2, 20f\n" - "st1 { v8.s }[2], [x13], #0x4\n" - "tbz x15, #1, 19f\n" - "st1 { v8.h }[6], [x13], #0x2\n" - "tbz x15, #0, 26f\n" - "st1 { v8.b }[14], [x13]\n" - "b 26f\n" - "19:" 
// Height 1: Partial direct writeback: partial_1_12 - "tbz x15, #0, 26f\n" - "st1 { v8.b }[12], [x13]\n" - "b 26f\n" - "20:" // Height 1: Partial direct writeback: partial_2_8 - "tbz x15, #1, 21f\n" - "st1 { v8.h }[4], [x13], #0x2\n" - "tbz x15, #0, 26f\n" - "st1 { v8.b }[10], [x13]\n" - "b 26f\n" - "21:" // Height 1: Partial direct writeback: partial_1_8 - "tbz x15, #0, 26f\n" - "st1 { v8.b }[8], [x13]\n" - "b 26f\n" - "22:" // Height 1: Partial direct writeback: partial_4_0 - "tbz x15, #2, 24f\n" - "str s8, [x13], #0x4\n" - "tbz x15, #1, 23f\n" - "st1 { v8.h }[2], [x13], #0x2\n" - "tbz x15, #0, 26f\n" - "st1 { v8.b }[6], [x13]\n" - "b 26f\n" - "23:" // Height 1: Partial direct writeback: partial_1_4 - "tbz x15, #0, 26f\n" - "st1 { v8.b }[4], [x13]\n" - "b 26f\n" - "24:" // Height 1: Partial direct writeback: partial_2_0 - "tbz x15, #1, 25f\n" - "str h8, [x13], #0x2\n" - "tbz x15, #0, 26f\n" - "st1 { v8.b }[2], [x13]\n" - "b 26f\n" - "25:" // Height 1: Partial direct writeback: partial_1_0 - "str b8, [x13, #0x0]\n" - "26:" // Height 1: Partial direct writeback: Done - "b 28f\n" - "27:" // Height 1: Full writeback - "str q8, [x13, #0x0]\n" - "add x13, x13, #0x10\n" - "28:" // Height 1: Writeback done - "subs x15, x15, #0x10\n" - "bgt 3b\n" - "b 170f\n" - "29:" // Height 2 - "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "mov x16, %x[col_bias]\n" - "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 30f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "add x9, x9, x19\n" - "b 31f\n" - "30:" // Height 2: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19\n" - "31:" // Height 2: Column loop + "bge 26f\n" + "tbz x10, #3, 21f\n" + "str d8, [x9], #0x8\n" + "tbz x10, #2, 19f\n" + "st1 { v8.s }[2], [x9], #0x4\n" + "tbz 
x10, #1, 18f\n" + "st1 { v8.h }[6], [x9], #0x2\n" + "tbz x10, #0, 25f\n" + "st1 { v8.b }[14], [x9]\n" + "b 25f\n" + "18:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x10, #0, 25f\n" + "st1 { v8.b }[12], [x9]\n" + "b 25f\n" + "19:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x10, #1, 20f\n" + "st1 { v8.h }[4], [x9], #0x2\n" + "tbz x10, #0, 25f\n" + "st1 { v8.b }[10], [x9]\n" + "b 25f\n" + "20:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x10, #0, 25f\n" + "st1 { v8.b }[8], [x9]\n" + "b 25f\n" + "21:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x10, #2, 23f\n" + "str s8, [x9], #0x4\n" + "tbz x10, #1, 22f\n" + "st1 { v8.h }[2], [x9], #0x2\n" + "tbz x10, #0, 25f\n" + "st1 { v8.b }[6], [x9]\n" + "b 25f\n" + "22:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x10, #0, 25f\n" + "st1 { v8.b }[4], [x9]\n" + "b 25f\n" + "23:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x10, #1, 24f\n" + "str h8, [x9], #0x2\n" + "tbz x10, #0, 25f\n" + "st1 { v8.b }[2], [x9]\n" + "b 25f\n" + "24:" // Height 1: Partial direct writeback: partial_1_0 + "str b8, [x9, #0x0]\n" + "25:" // Height 1: Partial direct writeback: Done + "b 27f\n" + "26:" // Height 1: Full writeback + "str q8, [x9, #0x0]\n" + "add x9, x9, #0x10\n" + "27:" // Height 1: Writeback done + "subs x10, x10, #0x10\n" + "bgt 2b\n" + "b 164f\n" + "28:" // Height 2 + "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x11, %x[col_bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x9, %x[output_ptr]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "29:" // Height 2: Column loop "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -409,239 +394,241 @@ void a64_hybrid_s8qs_dot_6x16 ( "movi v13.4s, #0x0\n" "movi v14.4s, #0x0\n" "movi v15.4s, #0x0\n" - "32:" // Height 2: setup done - "mov x12, #0x0\n" - "33:" // Height 2: String loop + "30:" // Height 2: setup done 
+ "mov x27, #0x0\n" + "31:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 34f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 32f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "cbnz x12, 35f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x27, 33f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "b 35f\n" - "34:" // Height 2: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "35:" // Height 2: input setup done - "cmp x11, #0x10\n" - "blt 38f\n" - "cmp x11, #0x20\n" - "blt 37f\n" - "36:" // Height 2: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "b 33f\n" + "32:" // Height 2: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "33:" // Height 2: input setup done + "cmp x26, #0x10\n" + "blt 36f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x20\n" + "ldr q6, [x28, #0x0]\n" + "blt 35f\n" + "34:" // Height 2: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x28, #0x10]\n" + "add x25, x25, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x14, #0x20]\n" - "add x10, x10, #0x10\n" + "ldr q6, [x28, #0x20]\n" + "add x24, x24, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "sub x26, x26, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, 
[x28, #0x30]\n" + "cmp x26, #0x20\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "sub x11, x11, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x14, #0x40]\n" - "cmp x11, #0x20\n" + "ldr q6, [x28, #0x40]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x28, #0x50]\n" ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x28, #0x60]\n" ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x28, #0x70]\n" ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x28, #0x80]\n" ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x28, #0x90]\n" ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x28, #0xa0]\n" ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x28, #0xb0]\n" ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x28, #0xc0]\n" ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x28, #0xd0]\n" ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x28, #0xe0]\n" ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, 
v1.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" - "add x14, x14, #0x100\n" + "ldr q7, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + "ldr q6, [x28, #0x0]\n" ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + "ldr q0, [x25, #0x0]\n" ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" - "bge 36b\n" - "37:" // Height 2: Multiply loop: Single iteration only - "sub x11, x11, #0x10\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "bge 34b\n" + "35:" // Height 2: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x28, #0x10]\n" + "sub x26, x26, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x14, #0x20]\n" - "add x10, x10, #0x10\n" + "ldr q6, [x28, #0x20]\n" + "add x25, x25, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x28, #0x30]\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x14, #0x40]\n" + "ldr q6, [x28, #0x40]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x28, #0x50]\n" ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x28, #0x60]\n" ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x28, #0x70]\n" ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, 
v0.4b[1]\n" ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x28, #0x80]\n" ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x28, #0x90]\n" ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x28, #0xa0]\n" ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x28, #0xb0]\n" ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x28, #0xc0]\n" ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x28, #0xd0]\n" ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x28, #0xe0]\n" ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" - "add x14, x14, #0x100\n" + "ldr q7, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" - "38:" // Height 2: Multiply loop: Main loop skip - "cbz x11, 43f\n" - "cmp x11, #0x4\n" - "blt 40f\n" - "39:" // Height 2: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x28], #0x4\n" - "ldr q6, [x14, #0x0]\n" + "36:" // Height 2: Multiply loop: Main loop skip + "cbz x26, 41f\n" + "cmp x26, #0x4\n" + "blt 38f\n" + "37:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x4\n" + "ldr s1, [x24], #0x4\n" + "cmp x26, 
#0x4\n" + "ldr q6, [x28, #0x0]\n" ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x28, #0x10]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x14, #0x20]\n" - "sub x11, x11, #0x4\n" + "ldr q6, [x28, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "cmp x11, #0x4\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x28, #0x30]\n" + "add x28, x28, #0x40\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - "bge 39b\n" - "cbz x11, 43f\n" - "40:" // Height 2: Multiply loop: Skip odd blocks - "tbz x11, #1, 41f\n" - "ldr h0, [x10], #0x2\n" - "ldr h1, [x28], #0x2\n" - "tbz x11, #0, 42f\n" - "ld1 { v0.b }[2], [x10]\n" - "ld1 { v1.b }[2], [x28]\n" - "b 42f\n" - "41:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x10, #0x0]\n" - "ldr b1, [x28, #0x0]\n" - "42:" // Height 2: Multiply loop: Ragged operand read: Done - "ldr q6, [x14, #0x0]\n" + "bge 37b\n" + "cbz x26, 41f\n" + "38:" // Height 2: Multiply loop: Skip odd blocks + "tbz x26, #1, 39f\n" + "ldr h0, [x25], #0x2\n" + "ldr h1, [x24], #0x2\n" + "tbz x26, #0, 40f\n" + "ld1 { v0.b }[2], [x25]\n" + "ld1 { v1.b }[2], [x24]\n" + "b 40f\n" + "39:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x25, #0x0]\n" + "ldr b1, [x24, #0x0]\n" + "40:" // Height 2: Multiply loop: Ragged operand read: Done + "ldr q6, [x28, #0x0]\n" ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x28, #0x10]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x28, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + 
"ldr q7, [x28, #0x30]\n" + "add x28, x28, #0x40\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - "43:" // Height 2: Multiply loop: No odd multiplies + "41:" // Height 2: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 33b\n" - "prfm pstl1keep, [x13, #0x0]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 31b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" "prfm pstl1keep, [x9, #0x0]\n" - "ldr q0, [x16, #0x0]\n" + "add x23, x9, x19\n" + "ldr q0, [x11, #0x0]\n" "add v8.4s, v8.4s, v0.4s\n" - "ldr q1, [x16, #0x10]\n" + "prfm pstl1keep, [x23, #0x0]\n" "add v12.4s, v12.4s, v0.4s\n" - "ldr q2, [x16, #0x20]\n" - "ldr q3, [x16, #0x30]\n" + "ldr q1, [x11, #0x10]\n" + "ldr q2, [x11, #0x20]\n" "add v9.4s, v9.4s, v1.4s\n" - "add x16, x16, #0x40\n" - "add v13.4s, v13.4s, v1.4s\n" + "ldr q3, [x11, #0x30]\n" + "add x11, x11, #0x40\n" "add v10.4s, v10.4s, v2.4s\n" - "add v11.4s, v11.4s, v3.4s\n" + "add v13.4s, v13.4s, v1.4s\n" "add v14.4s, v14.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" "add v15.4s, v15.4s, v3.4s\n" - "tbz %x[flags], #4, 44f\n" - "ldr q0, [x17, #0x0]\n" - "ldr q4, [x8, #0x0]\n" - "ldr q1, [x17, #0x10]\n" - "ldr q5, [x8, #0x10]\n" - "ldr q2, [x17, #0x20]\n" - "ldr q6, [x8, #0x20]\n" - "ldr q3, [x17, #0x30]\n" - "ldr q7, [x8, #0x30]\n" - "add x17, x17, #0x40\n" - "add x8, x8, #0x40\n" - "b 45f\n" - "44:" // Height 2: per layer parameters - "add x19, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x19]\n" + "tbz %x[flags], #4, 42f\n" + "ldr q0, [x12, #0x0]\n" + "ldr q4, [x13, #0x0]\n" + "ldr q1, [x12, #0x10]\n" + "ldr q5, [x13, #0x10]\n" + "ldr q2, [x12, #0x20]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q3, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + "ldr q7, [x13, 
#0x30]\n" + "add x13, x13, #0x40\n" + "b 43f\n" + "42:" // Height 2: per layer parameters + "add x24, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x24]\n" "mov v1.16b, v0.16b\n" - "add x19, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x19]\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x24]\n" "mov v2.16b, v0.16b\n" "mov v3.16b, v0.16b\n" "mov v5.16b, v4.16b\n" "mov v6.16b, v4.16b\n" "mov v7.16b, v4.16b\n" - "45:" // Height 2: parameters loaded + "43:" // Height 2: parameters loaded "sqrdmulh v8.4s, v8.4s, v4.4s\n" "sqrdmulh v9.4s, v9.4s, v5.4s\n" "sqrdmulh v10.4s, v10.4s, v6.4s\n" @@ -650,7 +637,7 @@ void a64_hybrid_s8qs_dot_6x16 ( "sqrdmulh v13.4s, v13.4s, v5.4s\n" "sqrdmulh v14.4s, v14.4s, v6.4s\n" "sqrdmulh v15.4s, v15.4s, v7.4s\n" - "tbz %x[flags], #5, 46f\n" + "tbz %x[flags], #5, 44f\n" "and v4.16b, v8.16b, v0.16b\n" "sshr v4.4s, v4.4s, #0x1f\n" "and v5.16b, v9.16b, v1.16b\n" @@ -675,18 +662,18 @@ void a64_hybrid_s8qs_dot_6x16 ( "sqadd v13.4s, v13.4s, v5.4s\n" "sqadd v14.4s, v14.4s, v6.4s\n" "sqadd v15.4s, v15.4s, v7.4s\n" - "46:" // Height 2: no shift correction + "44:" // Height 2: no shift correction "srshl v8.4s, v8.4s, v0.4s\n" - "add x19, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x19]\n" + "add x24, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x24]\n" "srshl v9.4s, v9.4s, v1.4s\n" - "add x19, %x[qp], %[minval]\n" + "add x24, %x[qp], %[minval]\n" "srshl v10.4s, v10.4s, v2.4s\n" - "ld1r { v5.4s }, [x19]\n" - "add x19, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x24]\n" + "add x24, %x[qp], %[maxval]\n" "srshl v11.4s, v11.4s, v3.4s\n" - "ld1r { v6.4s }, [x19]\n" - "cmp x15, #0x10\n" + "ld1r { v6.4s }, [x24]\n" + "cmp x10, #0x10\n" "srshl v12.4s, v12.4s, v0.4s\n" "srshl v13.4s, v13.4s, v1.4s\n" "srshl v14.4s, v14.4s, v2.4s\n" @@ -721,96 +708,83 @@ void a64_hybrid_s8qs_dot_6x16 ( "uzp1 v8.16b, v8.16b, v9.16b\n" "uzp1 v13.8h, v14.8h, v15.8h\n" "uzp1 v12.16b, v12.16b, v13.16b\n" - "bge 55f\n" - "tbz x15, #3, 50f\n" - "str d8, [x13], 
#0x8\n" - "str d12, [x9], #0x8\n" - "tbz x15, #2, 48f\n" - "st1 { v8.s }[2], [x13], #0x4\n" - "st1 { v12.s }[2], [x9], #0x4\n" - "tbz x15, #1, 47f\n" - "st1 { v8.h }[6], [x13], #0x2\n" - "st1 { v12.h }[6], [x9], #0x2\n" - "tbz x15, #0, 54f\n" - "st1 { v8.b }[14], [x13]\n" - "st1 { v12.b }[14], [x9]\n" - "b 54f\n" - "47:" // Height 2: Partial direct writeback: partial_1_12 - "tbz x15, #0, 54f\n" - "st1 { v8.b }[12], [x13]\n" - "st1 { v12.b }[12], [x9]\n" - "b 54f\n" - "48:" // Height 2: Partial direct writeback: partial_2_8 - "tbz x15, #1, 49f\n" - "st1 { v8.h }[4], [x13], #0x2\n" - "st1 { v12.h }[4], [x9], #0x2\n" - "tbz x15, #0, 54f\n" - "st1 { v8.b }[10], [x13]\n" - "st1 { v12.b }[10], [x9]\n" - "b 54f\n" - "49:" // Height 2: Partial direct writeback: partial_1_8 - "tbz x15, #0, 54f\n" - "st1 { v8.b }[8], [x13]\n" - "st1 { v12.b }[8], [x9]\n" - "b 54f\n" - "50:" // Height 2: Partial direct writeback: partial_4_0 - "tbz x15, #2, 52f\n" - "str s8, [x13], #0x4\n" - "str s12, [x9], #0x4\n" - "tbz x15, #1, 51f\n" - "st1 { v8.h }[2], [x13], #0x2\n" - "st1 { v12.h }[2], [x9], #0x2\n" - "tbz x15, #0, 54f\n" - "st1 { v8.b }[6], [x13]\n" - "st1 { v12.b }[6], [x9]\n" - "b 54f\n" - "51:" // Height 2: Partial direct writeback: partial_1_4 - "tbz x15, #0, 54f\n" - "st1 { v8.b }[4], [x13]\n" - "st1 { v12.b }[4], [x9]\n" + "bge 53f\n" + "tbz x10, #3, 48f\n" + "str d8, [x9], #0x8\n" + "str d12, [x23], #0x8\n" + "tbz x10, #2, 46f\n" + "st1 { v8.s }[2], [x9], #0x4\n" + "st1 { v12.s }[2], [x23], #0x4\n" + "tbz x10, #1, 45f\n" + "st1 { v8.h }[6], [x9], #0x2\n" + "st1 { v12.h }[6], [x23], #0x2\n" + "tbz x10, #0, 52f\n" + "st1 { v8.b }[14], [x9]\n" + "st1 { v12.b }[14], [x23]\n" + "b 52f\n" + "45:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x10, #0, 52f\n" + "st1 { v8.b }[12], [x9]\n" + "st1 { v12.b }[12], [x23]\n" + "b 52f\n" + "46:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x10, #1, 47f\n" + "st1 { v8.h }[4], [x9], #0x2\n" + "st1 { v12.h }[4], [x23], 
#0x2\n" + "tbz x10, #0, 52f\n" + "st1 { v8.b }[10], [x9]\n" + "st1 { v12.b }[10], [x23]\n" + "b 52f\n" + "47:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x10, #0, 52f\n" + "st1 { v8.b }[8], [x9]\n" + "st1 { v12.b }[8], [x23]\n" + "b 52f\n" + "48:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x10, #2, 50f\n" + "str s8, [x9], #0x4\n" + "str s12, [x23], #0x4\n" + "tbz x10, #1, 49f\n" + "st1 { v8.h }[2], [x9], #0x2\n" + "st1 { v12.h }[2], [x23], #0x2\n" + "tbz x10, #0, 52f\n" + "st1 { v8.b }[6], [x9]\n" + "st1 { v12.b }[6], [x23]\n" + "b 52f\n" + "49:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x10, #0, 52f\n" + "st1 { v8.b }[4], [x9]\n" + "st1 { v12.b }[4], [x23]\n" + "b 52f\n" + "50:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x10, #1, 51f\n" + "str h8, [x9], #0x2\n" + "str h12, [x23], #0x2\n" + "tbz x10, #0, 52f\n" + "st1 { v8.b }[2], [x9]\n" + "st1 { v12.b }[2], [x23]\n" + "b 52f\n" + "51:" // Height 2: Partial direct writeback: partial_1_0 + "str b8, [x9, #0x0]\n" + "str b12, [x23, #0x0]\n" + "52:" // Height 2: Partial direct writeback: Done "b 54f\n" - "52:" // Height 2: Partial direct writeback: partial_2_0 - "tbz x15, #1, 53f\n" - "str h8, [x13], #0x2\n" - "str h12, [x9], #0x2\n" - "tbz x15, #0, 54f\n" - "st1 { v8.b }[2], [x13]\n" - "st1 { v12.b }[2], [x9]\n" - "b 54f\n" - "53:" // Height 2: Partial direct writeback: partial_1_0 - "str b8, [x13, #0x0]\n" - "str b12, [x9, #0x0]\n" - "54:" // Height 2: Partial direct writeback: Done - "b 56f\n" - "55:" // Height 2: Full writeback - "str q8, [x13, #0x0]\n" - "str q12, [x9, #0x0]\n" - "add x13, x13, #0x10\n" + "53:" // Height 2: Full writeback + "str q8, [x9, #0x0]\n" "add x9, x9, #0x10\n" - "56:" // Height 2: Writeback done - "subs x15, x15, #0x10\n" - "bgt 31b\n" - "b 170f\n" - "57:" // Height 3 - "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "mov x16, %x[col_bias]\n" - "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" - "ldr x15, 
[%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 58f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19\n" - "add x27, x27, x19\n" - "b 59f\n" - "58:" // Height 3: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19\n" - "add x27, x9, x19\n" - "59:" // Height 3: Column loop + "str q12, [x23, #0x0]\n" + "54:" // Height 2: Writeback done + "subs x10, x10, #0x10\n" + "bgt 29b\n" + "b 164f\n" + "55:" // Height 3 + "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x11, %x[col_bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x9, %x[output_ptr]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "56:" // Height 3: Column loop "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -823,297 +797,300 @@ void a64_hybrid_s8qs_dot_6x16 ( "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" "movi v19.4s, #0x0\n" - "60:" // Height 3: setup done - "mov x12, #0x0\n" - "61:" // Height 3: String loop + "57:" // Height 3: setup done + "mov x27, #0x0\n" + "58:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 62f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 59f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "cbnz x12, 63f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "cbnz x27, 60f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "add x26, x26, x19\n" 
- "b 63f\n" - "62:" // Height 3: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "add x26, x28, x19\n" - "63:" // Height 3: input setup done - "cmp x11, #0x10\n" - "blt 66f\n" - "cmp x11, #0x20\n" - "blt 65f\n" - "64:" // Height 3: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "b 60f\n" + "59:" // Height 3: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "60:" // Height 3: input setup done + "cmp x26, #0x10\n" + "blt 63f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x20\n" + "ldr q2, [x23, #0x0]\n" + "ldr q6, [x28, #0x0]\n" + "blt 62f\n" + "61:" // Height 3: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x28, #0x10]\n" + "add x25, x25, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x28, #0x20]\n" + "add x23, x23, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "add x28, x28, #0x10\n" - "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sub x26, x26, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x23, #0x80]\n" + "cmp x26, #0x20\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x28, #0x30]\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "sub x11, x11, #0x10\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - "cmp x11, #0x20\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x14, #0x40]\n" + "ldr q6, 
[x28, #0x40]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x28, #0x50]\n" ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x28, #0x60]\n" ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x28, #0x70]\n" ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x28, #0x80]\n" ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x28, #0x90]\n" ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x28, #0xa0]\n" ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x28, #0xb0]\n" ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x28, #0xc0]\n" ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x28, #0xd0]\n" ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, 
v0.4b[3]\n" ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x28, #0xe0]\n" ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" - "add x14, x14, #0x100\n" + "ldr q7, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + "ldr q6, [x28, #0x0]\n" ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + "ldr q0, [x25, #0x0]\n" ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + "ldr q1, [x24, #0x0]\n" ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" - "bge 64b\n" - "65:" // Height 3: Multiply loop: Single iteration only - "sub x11, x11, #0x10\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "ldr q2, [x23, #0x0]\n" + "bge 61b\n" + "62:" // Height 3: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x28, #0x10]\n" + "sub x26, x26, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "add x25, x25, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "add x24, x24, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "add x28, x28, #0x10\n" - "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x28, #0x20]\n" + "add x23, x23, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "ldr q7, [x14, #0x30]\n" + "prfm pldl1keep, [x23, #0x80]\n" + 
"ldr q7, [x28, #0x30]\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x14, #0x40]\n" + "ldr q6, [x28, #0x40]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x28, #0x50]\n" ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x28, #0x60]\n" ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x28, #0x70]\n" ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x28, #0x80]\n" ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x28, #0x90]\n" ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x28, #0xa0]\n" ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x28, #0xb0]\n" ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x28, #0xc0]\n" ".inst 0x4f80e8eb // sdot v11.4s, 
v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x28, #0xd0]\n" ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x28, #0xe0]\n" ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" - "add x14, x14, #0x100\n" + "ldr q7, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" - "66:" // Height 3: Multiply loop: Main loop skip - "cbz x11, 71f\n" - "cmp x11, #0x4\n" - "blt 68f\n" - "67:" // Height 3: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x28], #0x4\n" - "ldr s2, [x26], #0x4\n" - "ldr q6, [x14, #0x0]\n" + "63:" // Height 3: Multiply loop: Main loop skip + "cbz x26, 68f\n" + "cmp x26, #0x4\n" + "blt 65f\n" + "64:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x4\n" + "ldr s1, [x24], #0x4\n" + "cmp x26, #0x4\n" + "ldr s2, [x23], #0x4\n" + "ldr q6, [x28, #0x0]\n" ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x28, #0x10]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "sub x11, x11, #0x4\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x14, #0x20]\n" - "cmp x11, #0x4\n" + "ldr q6, [x28, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f1 // sdot 
v17.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x28, #0x30]\n" + "add x28, x28, #0x40\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - "bge 67b\n" - "cbz x11, 71f\n" - "68:" // Height 3: Multiply loop: Skip odd blocks - "tbz x11, #1, 69f\n" - "ldr h0, [x10], #0x2\n" - "ldr h1, [x28], #0x2\n" - "ldr h2, [x26], #0x2\n" - "tbz x11, #0, 70f\n" - "ld1 { v0.b }[2], [x10]\n" - "ld1 { v1.b }[2], [x28]\n" - "ld1 { v2.b }[2], [x26]\n" - "b 70f\n" - "69:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x10, #0x0]\n" - "ldr b1, [x28, #0x0]\n" - "ldr b2, [x26, #0x0]\n" - "70:" // Height 3: Multiply loop: Ragged operand read: Done - "ldr q6, [x14, #0x0]\n" + "bge 64b\n" + "cbz x26, 68f\n" + "65:" // Height 3: Multiply loop: Skip odd blocks + "tbz x26, #1, 66f\n" + "ldr h0, [x25], #0x2\n" + "ldr h1, [x24], #0x2\n" + "ldr h2, [x23], #0x2\n" + "tbz x26, #0, 67f\n" + "ld1 { v0.b }[2], [x25]\n" + "ld1 { v1.b }[2], [x24]\n" + "ld1 { v2.b }[2], [x23]\n" + "b 67f\n" + "66:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x25, #0x0]\n" + "ldr b1, [x24, #0x0]\n" + "ldr b2, [x23, #0x0]\n" + "67:" // Height 3: Multiply loop: Ragged operand read: Done + "ldr q6, [x28, #0x0]\n" ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x28, #0x10]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x28, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x14, 
#0x30]\n" + "ldr q7, [x28, #0x30]\n" + "add x28, x28, #0x40\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - "71:" // Height 3: Multiply loop: No odd multiplies + "68:" // Height 3: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 61b\n" - "prfm pstl1keep, [x13, #0x0]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 58b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" "prfm pstl1keep, [x9, #0x0]\n" - "ldr q0, [x16, #0x0]\n" + "add x23, x9, x19\n" + "ldr q0, [x11, #0x0]\n" "add v8.4s, v8.4s, v0.4s\n" - "prfm pstl1keep, [x27, #0x0]\n" - "ldr q1, [x16, #0x10]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19\n" "add v12.4s, v12.4s, v0.4s\n" - "ldr q2, [x16, #0x20]\n" + "prfm pstl1keep, [x22, #0x0]\n" "add v16.4s, v16.4s, v0.4s\n" - "ldr q3, [x16, #0x30]\n" - "add x16, x16, #0x40\n" + "ldr q1, [x11, #0x10]\n" + "ldr q2, [x11, #0x20]\n" "add v9.4s, v9.4s, v1.4s\n" - "add v13.4s, v13.4s, v1.4s\n" + "ldr q3, [x11, #0x30]\n" + "add x11, x11, #0x40\n" "add v10.4s, v10.4s, v2.4s\n" - "add v11.4s, v11.4s, v3.4s\n" + "add v13.4s, v13.4s, v1.4s\n" "add v14.4s, v14.4s, v2.4s\n" - "add v15.4s, v15.4s, v3.4s\n" "add v17.4s, v17.4s, v1.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v15.4s, v15.4s, v3.4s\n" "add v18.4s, v18.4s, v2.4s\n" "add v19.4s, v19.4s, v3.4s\n" - "tbz %x[flags], #4, 72f\n" - "ldr q0, [x17, #0x0]\n" - "ldr q4, [x8, #0x0]\n" - "ldr q1, [x17, #0x10]\n" - "ldr q5, [x8, #0x10]\n" - "ldr q2, [x17, #0x20]\n" - "ldr q6, [x8, #0x20]\n" - "ldr q3, [x17, #0x30]\n" - "ldr q7, [x8, #0x30]\n" - "add x17, x17, #0x40\n" - "add x8, x8, #0x40\n" - "b 73f\n" - "72:" // 
Height 3: per layer parameters - "add x19, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x19]\n" + "tbz %x[flags], #4, 69f\n" + "ldr q0, [x12, #0x0]\n" + "ldr q4, [x13, #0x0]\n" + "ldr q1, [x12, #0x10]\n" + "ldr q5, [x13, #0x10]\n" + "ldr q2, [x12, #0x20]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q3, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + "b 70f\n" + "69:" // Height 3: per layer parameters + "add x24, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x24]\n" "mov v1.16b, v0.16b\n" - "add x19, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x19]\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x24]\n" "mov v2.16b, v0.16b\n" "mov v3.16b, v0.16b\n" "mov v5.16b, v4.16b\n" "mov v6.16b, v4.16b\n" "mov v7.16b, v4.16b\n" - "73:" // Height 3: parameters loaded + "70:" // Height 3: parameters loaded "sqrdmulh v8.4s, v8.4s, v4.4s\n" "sqrdmulh v9.4s, v9.4s, v5.4s\n" "sqrdmulh v10.4s, v10.4s, v6.4s\n" @@ -1126,7 +1103,7 @@ void a64_hybrid_s8qs_dot_6x16 ( "sqrdmulh v17.4s, v17.4s, v5.4s\n" "sqrdmulh v18.4s, v18.4s, v6.4s\n" "sqrdmulh v19.4s, v19.4s, v7.4s\n" - "tbz %x[flags], #5, 74f\n" + "tbz %x[flags], #5, 71f\n" "and v4.16b, v8.16b, v0.16b\n" "sshr v4.4s, v4.4s, #0x1f\n" "and v5.16b, v9.16b, v1.16b\n" @@ -1163,18 +1140,18 @@ void a64_hybrid_s8qs_dot_6x16 ( "sqadd v17.4s, v17.4s, v5.4s\n" "sqadd v18.4s, v18.4s, v6.4s\n" "sqadd v19.4s, v19.4s, v7.4s\n" - "74:" // Height 3: no shift correction + "71:" // Height 3: no shift correction "srshl v8.4s, v8.4s, v0.4s\n" - "add x19, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x19]\n" + "add x24, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x24]\n" "srshl v9.4s, v9.4s, v1.4s\n" - "add x19, %x[qp], %[minval]\n" + "add x24, %x[qp], %[minval]\n" "srshl v10.4s, v10.4s, v2.4s\n" - "ld1r { v5.4s }, [x19]\n" - "add x19, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x24]\n" + "add x24, %x[qp], %[maxval]\n" "srshl v11.4s, v11.4s, v3.4s\n" - "ld1r { v6.4s }, [x19]\n" - "cmp 
x15, #0x10\n" + "ld1r { v6.4s }, [x24]\n" + "cmp x10, #0x10\n" "srshl v12.4s, v12.4s, v0.4s\n" "srshl v13.4s, v13.4s, v1.4s\n" "srshl v14.4s, v14.4s, v2.4s\n" @@ -1228,116 +1205,99 @@ void a64_hybrid_s8qs_dot_6x16 ( "uzp1 v8.16b, v8.16b, v9.16b\n" "uzp1 v12.16b, v12.16b, v13.16b\n" "uzp1 v16.16b, v16.16b, v17.16b\n" - "bge 83f\n" - "tbz x15, #3, 78f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "str d16, [x27], #0x8\n" - "tbz x15, #2, 76f\n" - "st1 { v8.s }[2], [x13], #0x4\n" - "st1 { v12.s }[2], [x9], #0x4\n" - "st1 { v16.s }[2], [x27], #0x4\n" - "tbz x15, #1, 75f\n" - "st1 { v8.h }[6], [x13], #0x2\n" - "st1 { v12.h }[6], [x9], #0x2\n" - "st1 { v16.h }[6], [x27], #0x2\n" - "tbz x15, #0, 82f\n" - "st1 { v8.b }[14], [x13]\n" - "st1 { v12.b }[14], [x9]\n" - "st1 { v16.b }[14], [x27]\n" - "b 82f\n" - "75:" // Height 3: Partial direct writeback: partial_1_12 - "tbz x15, #0, 82f\n" - "st1 { v8.b }[12], [x13]\n" - "st1 { v12.b }[12], [x9]\n" - "st1 { v16.b }[12], [x27]\n" - "b 82f\n" - "76:" // Height 3: Partial direct writeback: partial_2_8 - "tbz x15, #1, 77f\n" - "st1 { v8.h }[4], [x13], #0x2\n" - "st1 { v12.h }[4], [x9], #0x2\n" - "st1 { v16.h }[4], [x27], #0x2\n" - "tbz x15, #0, 82f\n" - "st1 { v8.b }[10], [x13]\n" - "st1 { v12.b }[10], [x9]\n" - "st1 { v16.b }[10], [x27]\n" - "b 82f\n" - "77:" // Height 3: Partial direct writeback: partial_1_8 - "tbz x15, #0, 82f\n" - "st1 { v8.b }[8], [x13]\n" - "st1 { v12.b }[8], [x9]\n" - "st1 { v16.b }[8], [x27]\n" - "b 82f\n" - "78:" // Height 3: Partial direct writeback: partial_4_0 - "tbz x15, #2, 80f\n" - "str s8, [x13], #0x4\n" - "str s12, [x9], #0x4\n" - "str s16, [x27], #0x4\n" - "tbz x15, #1, 79f\n" - "st1 { v8.h }[2], [x13], #0x2\n" - "st1 { v12.h }[2], [x9], #0x2\n" - "st1 { v16.h }[2], [x27], #0x2\n" - "tbz x15, #0, 82f\n" - "st1 { v8.b }[6], [x13]\n" - "st1 { v12.b }[6], [x9]\n" - "st1 { v16.b }[6], [x27]\n" - "b 82f\n" - "79:" // Height 3: Partial direct writeback: partial_1_4 - "tbz x15, #0, 82f\n" - 
"st1 { v8.b }[4], [x13]\n" - "st1 { v12.b }[4], [x9]\n" - "st1 { v16.b }[4], [x27]\n" - "b 82f\n" - "80:" // Height 3: Partial direct writeback: partial_2_0 - "tbz x15, #1, 81f\n" - "str h8, [x13], #0x2\n" - "str h12, [x9], #0x2\n" - "str h16, [x27], #0x2\n" - "tbz x15, #0, 82f\n" - "st1 { v8.b }[2], [x13]\n" - "st1 { v12.b }[2], [x9]\n" - "st1 { v16.b }[2], [x27]\n" - "b 82f\n" - "81:" // Height 3: Partial direct writeback: partial_1_0 - "str b8, [x13, #0x0]\n" - "str b12, [x9, #0x0]\n" - "str b16, [x27, #0x0]\n" - "82:" // Height 3: Partial direct writeback: Done - "b 84f\n" - "83:" // Height 3: Full writeback - "str q8, [x13, #0x0]\n" - "str q12, [x9, #0x0]\n" - "str q16, [x27, #0x0]\n" - "add x13, x13, #0x10\n" + "bge 80f\n" + "tbz x10, #3, 75f\n" + "str d8, [x9], #0x8\n" + "str d12, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "tbz x10, #2, 73f\n" + "st1 { v8.s }[2], [x9], #0x4\n" + "st1 { v12.s }[2], [x23], #0x4\n" + "st1 { v16.s }[2], [x22], #0x4\n" + "tbz x10, #1, 72f\n" + "st1 { v8.h }[6], [x9], #0x2\n" + "st1 { v12.h }[6], [x23], #0x2\n" + "st1 { v16.h }[6], [x22], #0x2\n" + "tbz x10, #0, 79f\n" + "st1 { v8.b }[14], [x9]\n" + "st1 { v12.b }[14], [x23]\n" + "st1 { v16.b }[14], [x22]\n" + "b 79f\n" + "72:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x10, #0, 79f\n" + "st1 { v8.b }[12], [x9]\n" + "st1 { v12.b }[12], [x23]\n" + "st1 { v16.b }[12], [x22]\n" + "b 79f\n" + "73:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x10, #1, 74f\n" + "st1 { v8.h }[4], [x9], #0x2\n" + "st1 { v12.h }[4], [x23], #0x2\n" + "st1 { v16.h }[4], [x22], #0x2\n" + "tbz x10, #0, 79f\n" + "st1 { v8.b }[10], [x9]\n" + "st1 { v12.b }[10], [x23]\n" + "st1 { v16.b }[10], [x22]\n" + "b 79f\n" + "74:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x10, #0, 79f\n" + "st1 { v8.b }[8], [x9]\n" + "st1 { v12.b }[8], [x23]\n" + "st1 { v16.b }[8], [x22]\n" + "b 79f\n" + "75:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x10, #2, 77f\n" + "str s8, 
[x9], #0x4\n" + "str s12, [x23], #0x4\n" + "str s16, [x22], #0x4\n" + "tbz x10, #1, 76f\n" + "st1 { v8.h }[2], [x9], #0x2\n" + "st1 { v12.h }[2], [x23], #0x2\n" + "st1 { v16.h }[2], [x22], #0x2\n" + "tbz x10, #0, 79f\n" + "st1 { v8.b }[6], [x9]\n" + "st1 { v12.b }[6], [x23]\n" + "st1 { v16.b }[6], [x22]\n" + "b 79f\n" + "76:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x10, #0, 79f\n" + "st1 { v8.b }[4], [x9]\n" + "st1 { v12.b }[4], [x23]\n" + "st1 { v16.b }[4], [x22]\n" + "b 79f\n" + "77:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x10, #1, 78f\n" + "str h8, [x9], #0x2\n" + "str h12, [x23], #0x2\n" + "str h16, [x22], #0x2\n" + "tbz x10, #0, 79f\n" + "st1 { v8.b }[2], [x9]\n" + "st1 { v12.b }[2], [x23]\n" + "st1 { v16.b }[2], [x22]\n" + "b 79f\n" + "78:" // Height 3: Partial direct writeback: partial_1_0 + "str b8, [x9, #0x0]\n" + "str b12, [x23, #0x0]\n" + "str b16, [x22, #0x0]\n" + "79:" // Height 3: Partial direct writeback: Done + "b 81f\n" + "80:" // Height 3: Full writeback + "str q8, [x9, #0x0]\n" "add x9, x9, #0x10\n" - "add x27, x27, #0x10\n" - "84:" // Height 3: Writeback done - "subs x15, x15, #0x10\n" - "bgt 59b\n" - "b 170f\n" - "85:" // Height 4 - "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "mov x16, %x[col_bias]\n" - "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 86f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "add x27, x27, x19\n" - "add x25, x25, x19\n" - "b 87f\n" - "86:" // Height 4: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19\n" - "add x27, x9, x19\n" - "add x25, x27, x19\n" - "87:" // Height 4: Column loop + "str q12, [x23, #0x0]\n" + "str q16, 
[x22, #0x0]\n" + "81:" // Height 3: Writeback done + "subs x10, x10, #0x10\n" + "bgt 56b\n" + "b 164f\n" + "82:" // Height 4 + "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x11, %x[col_bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x9, %x[output_ptr]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "83:" // Height 4: Column loop "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -1354,220 +1314,220 @@ void a64_hybrid_s8qs_dot_6x16 ( "movi v21.4s, #0x0\n" "movi v22.4s, #0x0\n" "movi v23.4s, #0x0\n" - "88:" // Height 4: setup done - "mov x12, #0x0\n" - "89:" // Height 4: String loop + "84:" // Height 4: setup done + "mov x27, #0x0\n" + "85:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 90f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 86f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "cbnz x12, 91f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "cbnz x27, 87f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "add x26, x26, x19\n" + "add x25, x25, x19\n" "add x24, x24, x19\n" - "b 91f\n" - "90:" // Height 4: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "add x26, x28, x19\n" - "add x24, x26, x19\n" - "91:" // Height 4: input setup done - "cmp x11, #0x10\n" - "blt 94f\n" - "cmp x11, #0x20\n" - "blt 93f\n" - "92:" // Height 4: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr 
q6, [x14, #0x0]\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "b 87f\n" + "86:" // Height 4: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "87:" // Height 4: input setup done + "cmp x26, #0x10\n" + "blt 90f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x20\n" + "ldr q2, [x23, #0x0]\n" + "ldr q3, [x22, #0x0]\n" + "ldr q6, [x28, #0x0]\n" + "blt 89f\n" + "88:" // Height 4: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x28, #0x10]\n" + "add x25, x25, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x23, x23, #0x10\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x28, #0x20]\n" + "add x22, x22, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "add x26, x26, #0x10\n" - "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "sub x26, x26, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "add x24, x24, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "cmp x26, #0x20\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "sub x11, x11, #0x10\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x28, #0x30]\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "cmp x11, #0x20\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x14, #0x40]\n" + "ldr q6, [x28, #0x40]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, 
v0.4b[0]\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x28, #0x50]\n" ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x28, #0x60]\n" ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x28, #0x70]\n" ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x28, #0x80]\n" ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x28, #0x90]\n" ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x28, #0xa0]\n" ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x28, #0xb0]\n" ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, 
v2.4b[2]\n" ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x28, #0xc0]\n" ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x28, #0xd0]\n" ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x28, #0xe0]\n" ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" - "add x14, x14, #0x100\n" + "ldr q7, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + "ldr q6, [x28, #0x0]\n" ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + "ldr q0, [x25, #0x0]\n" ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + "ldr q1, [x24, #0x0]\n" ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + "ldr q2, [x23, #0x0]\n" ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" - "bge 92b\n" - "93:" // Height 4: Multiply loop: Single iteration only - "sub x11, x11, #0x10\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "ldr q3, [x22, #0x0]\n" + "bge 88b\n" + "89:" // Height 4: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x28, #0x10]\n" + "sub x26, x26, #0x10\n" ".inst 0x4f81e0cc // 
sdot v12.4s, v6.16b, v1.4b[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "add x25, x25, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "add x28, x28, #0x10\n" + "add x24, x24, #0x10\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "ldr q6, [x14, #0x20]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x23, x23, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "add x26, x26, #0x10\n" - "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x28, #0x20]\n" + "add x22, x22, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "add x24, x24, #0x10\n" + "prfm pldl1keep, [x23, #0x80]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x28, #0x30]\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x14, #0x40]\n" + "ldr q6, [x28, #0x40]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x28, #0x50]\n" ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x28, #0x60]\n" ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" - "ldr q7, [x14, 
#0x70]\n" + "ldr q7, [x28, #0x70]\n" ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x28, #0x80]\n" ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x28, #0x90]\n" ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x28, #0xa0]\n" ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x28, #0xb0]\n" ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x28, #0xc0]\n" ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x28, #0xd0]\n" ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x28, #0xe0]\n" ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x4fa1e8ed // 
sdot v13.4s, v7.16b, v1.4b[3]\n" ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" - "add x14, x14, #0x100\n" + "ldr q7, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" @@ -1576,31 +1536,31 @@ void a64_hybrid_s8qs_dot_6x16 ( ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" - "94:" // Height 4: Multiply loop: Main loop skip - "cbz x11, 99f\n" - "cmp x11, #0x4\n" - "blt 96f\n" - "95:" // Height 4: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x28], #0x4\n" - "ldr s2, [x26], #0x4\n" - "ldr s3, [x24], #0x4\n" - "ldr q6, [x14, #0x0]\n" + "90:" // Height 4: Multiply loop: Main loop skip + "cbz x26, 95f\n" + "cmp x26, #0x4\n" + "blt 92f\n" + "91:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x4\n" + "ldr s1, [x24], #0x4\n" + "cmp x26, #0x4\n" + "ldr s2, [x23], #0x4\n" + "ldr s3, [x22], #0x4\n" + "ldr q6, [x28, #0x0]\n" ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x28, #0x10]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "sub x11, x11, #0x4\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "cmp x11, #0x4\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x28, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x28, #0x30]\n" + "add x28, x28, #0x40\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 
0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" @@ -1608,40 +1568,40 @@ void a64_hybrid_s8qs_dot_6x16 ( ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - "bge 95b\n" - "cbz x11, 99f\n" - "96:" // Height 4: Multiply loop: Skip odd blocks - "tbz x11, #1, 97f\n" - "ldr h0, [x10], #0x2\n" - "ldr h1, [x28], #0x2\n" - "ldr h2, [x26], #0x2\n" - "ldr h3, [x24], #0x2\n" - "tbz x11, #0, 98f\n" - "ld1 { v0.b }[2], [x10]\n" - "ld1 { v1.b }[2], [x28]\n" - "ld1 { v2.b }[2], [x26]\n" - "ld1 { v3.b }[2], [x24]\n" - "b 98f\n" - "97:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x10, #0x0]\n" - "ldr b1, [x28, #0x0]\n" - "ldr b2, [x26, #0x0]\n" - "ldr b3, [x24, #0x0]\n" - "98:" // Height 4: Multiply loop: Ragged operand read: Done - "ldr q6, [x14, #0x0]\n" + "bge 91b\n" + "cbz x26, 95f\n" + "92:" // Height 4: Multiply loop: Skip odd blocks + "tbz x26, #1, 93f\n" + "ldr h0, [x25], #0x2\n" + "ldr h1, [x24], #0x2\n" + "ldr h2, [x23], #0x2\n" + "ldr h3, [x22], #0x2\n" + "tbz x26, #0, 94f\n" + "ld1 { v0.b }[2], [x25]\n" + "ld1 { v1.b }[2], [x24]\n" + "ld1 { v2.b }[2], [x23]\n" + "ld1 { v3.b }[2], [x22]\n" + "b 94f\n" + "93:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x25, #0x0]\n" + "ldr b1, [x24, #0x0]\n" + "ldr b2, [x23, #0x0]\n" + "ldr b3, [x22, #0x0]\n" + "94:" // Height 4: Multiply loop: Ragged operand read: Done + "ldr q6, [x28, #0x0]\n" ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x28, #0x10]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x28, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" 
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x28, #0x30]\n" + "add x28, x28, #0x40\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" @@ -1649,60 +1609,64 @@ void a64_hybrid_s8qs_dot_6x16 ( ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - "99:" // Height 4: Multiply loop: No odd multiplies + "95:" // Height 4: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 89b\n" - "prfm pstl1keep, [x13, #0x0]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 85b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" "prfm pstl1keep, [x9, #0x0]\n" - "ldr q0, [x16, #0x0]\n" + "add x23, x9, x19\n" + "ldr q0, [x11, #0x0]\n" "add v8.4s, v8.4s, v0.4s\n" - "prfm pstl1keep, [x27, #0x0]\n" - "ldr q1, [x16, #0x10]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19\n" "add v12.4s, v12.4s, v0.4s\n" - "prfm pstl1keep, [x25, #0x0]\n" - "ldr q2, [x16, #0x20]\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19\n" "add v16.4s, v16.4s, v0.4s\n" - "ldr q3, [x16, #0x30]\n" + "prfm pstl1keep, [x21, #0x0]\n" "add v20.4s, v20.4s, v0.4s\n" - "add x16, x16, #0x40\n" + "ldr q1, [x11, #0x10]\n" + "ldr q2, [x11, #0x20]\n" "add v9.4s, v9.4s, v1.4s\n" - "add v13.4s, v13.4s, v1.4s\n" + "ldr q3, [x11, #0x30]\n" + "add x11, x11, #0x40\n" "add v10.4s, v10.4s, v2.4s\n" - "add v11.4s, v11.4s, v3.4s\n" + "add v13.4s, v13.4s, v1.4s\n" "add v14.4s, v14.4s, v2.4s\n" - "add v15.4s, v15.4s, v3.4s\n" "add v17.4s, v17.4s, v1.4s\n" + "add v11.4s, v11.4s, 
v3.4s\n" + "add v15.4s, v15.4s, v3.4s\n" "add v18.4s, v18.4s, v2.4s\n" "add v19.4s, v19.4s, v3.4s\n" "add v21.4s, v21.4s, v1.4s\n" "add v22.4s, v22.4s, v2.4s\n" "add v23.4s, v23.4s, v3.4s\n" - "tbz %x[flags], #4, 100f\n" - "ldr q0, [x17, #0x0]\n" - "ldr q4, [x8, #0x0]\n" - "ldr q1, [x17, #0x10]\n" - "ldr q5, [x8, #0x10]\n" - "ldr q2, [x17, #0x20]\n" - "ldr q6, [x8, #0x20]\n" - "ldr q3, [x17, #0x30]\n" - "ldr q7, [x8, #0x30]\n" - "add x17, x17, #0x40\n" - "add x8, x8, #0x40\n" - "b 101f\n" - "100:" // Height 4: per layer parameters - "add x19, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x19]\n" + "tbz %x[flags], #4, 96f\n" + "ldr q0, [x12, #0x0]\n" + "ldr q4, [x13, #0x0]\n" + "ldr q1, [x12, #0x10]\n" + "ldr q5, [x13, #0x10]\n" + "ldr q2, [x12, #0x20]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q3, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + "b 97f\n" + "96:" // Height 4: per layer parameters + "add x24, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x24]\n" "mov v1.16b, v0.16b\n" - "add x19, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x19]\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x24]\n" "mov v2.16b, v0.16b\n" "mov v3.16b, v0.16b\n" "mov v5.16b, v4.16b\n" "mov v6.16b, v4.16b\n" "mov v7.16b, v4.16b\n" - "101:" // Height 4: parameters loaded + "97:" // Height 4: parameters loaded "sqrdmulh v8.4s, v8.4s, v4.4s\n" "sqrdmulh v9.4s, v9.4s, v5.4s\n" "sqrdmulh v10.4s, v10.4s, v6.4s\n" @@ -1719,7 +1683,7 @@ void a64_hybrid_s8qs_dot_6x16 ( "sqrdmulh v21.4s, v21.4s, v5.4s\n" "sqrdmulh v22.4s, v22.4s, v6.4s\n" "sqrdmulh v23.4s, v23.4s, v7.4s\n" - "tbz %x[flags], #5, 102f\n" + "tbz %x[flags], #5, 98f\n" "and v4.16b, v8.16b, v0.16b\n" "sshr v4.4s, v4.4s, #0x1f\n" "and v5.16b, v9.16b, v1.16b\n" @@ -1768,18 +1732,18 @@ void a64_hybrid_s8qs_dot_6x16 ( "sqadd v21.4s, v21.4s, v5.4s\n" "sqadd v22.4s, v22.4s, v6.4s\n" "sqadd v23.4s, v23.4s, v7.4s\n" - "102:" // Height 4: no shift correction + 
"98:" // Height 4: no shift correction "srshl v8.4s, v8.4s, v0.4s\n" - "add x19, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x19]\n" + "add x24, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x24]\n" "srshl v9.4s, v9.4s, v1.4s\n" - "add x19, %x[qp], %[minval]\n" + "add x24, %x[qp], %[minval]\n" "srshl v10.4s, v10.4s, v2.4s\n" - "ld1r { v5.4s }, [x19]\n" - "add x19, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x24]\n" + "add x24, %x[qp], %[maxval]\n" "srshl v11.4s, v11.4s, v3.4s\n" - "ld1r { v6.4s }, [x19]\n" - "cmp x15, #0x10\n" + "ld1r { v6.4s }, [x24]\n" + "cmp x10, #0x10\n" "srshl v12.4s, v12.4s, v0.4s\n" "srshl v13.4s, v13.4s, v1.4s\n" "srshl v14.4s, v14.4s, v2.4s\n" @@ -1852,136 +1816,115 @@ void a64_hybrid_s8qs_dot_6x16 ( "uzp1 v12.16b, v12.16b, v13.16b\n" "uzp1 v16.16b, v16.16b, v17.16b\n" "uzp1 v20.16b, v20.16b, v21.16b\n" - "bge 111f\n" - "tbz x15, #3, 106f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "str d16, [x27], #0x8\n" - "str d20, [x25], #0x8\n" - "tbz x15, #2, 104f\n" - "st1 { v8.s }[2], [x13], #0x4\n" - "st1 { v12.s }[2], [x9], #0x4\n" - "st1 { v16.s }[2], [x27], #0x4\n" - "st1 { v20.s }[2], [x25], #0x4\n" - "tbz x15, #1, 103f\n" - "st1 { v8.h }[6], [x13], #0x2\n" - "st1 { v12.h }[6], [x9], #0x2\n" - "st1 { v16.h }[6], [x27], #0x2\n" - "st1 { v20.h }[6], [x25], #0x2\n" - "tbz x15, #0, 110f\n" - "st1 { v8.b }[14], [x13]\n" - "st1 { v12.b }[14], [x9]\n" - "st1 { v16.b }[14], [x27]\n" - "st1 { v20.b }[14], [x25]\n" - "b 110f\n" - "103:" // Height 4: Partial direct writeback: partial_1_12 - "tbz x15, #0, 110f\n" - "st1 { v8.b }[12], [x13]\n" - "st1 { v12.b }[12], [x9]\n" - "st1 { v16.b }[12], [x27]\n" - "st1 { v20.b }[12], [x25]\n" - "b 110f\n" - "104:" // Height 4: Partial direct writeback: partial_2_8 - "tbz x15, #1, 105f\n" - "st1 { v8.h }[4], [x13], #0x2\n" - "st1 { v12.h }[4], [x9], #0x2\n" - "st1 { v16.h }[4], [x27], #0x2\n" - "st1 { v20.h }[4], [x25], #0x2\n" - "tbz x15, #0, 110f\n" - "st1 { v8.b }[10], [x13]\n" - "st1 { v12.b }[10], [x9]\n" 
- "st1 { v16.b }[10], [x27]\n" - "st1 { v20.b }[10], [x25]\n" - "b 110f\n" - "105:" // Height 4: Partial direct writeback: partial_1_8 - "tbz x15, #0, 110f\n" - "st1 { v8.b }[8], [x13]\n" - "st1 { v12.b }[8], [x9]\n" - "st1 { v16.b }[8], [x27]\n" - "st1 { v20.b }[8], [x25]\n" - "b 110f\n" - "106:" // Height 4: Partial direct writeback: partial_4_0 - "tbz x15, #2, 108f\n" - "str s8, [x13], #0x4\n" - "str s12, [x9], #0x4\n" - "str s16, [x27], #0x4\n" - "str s20, [x25], #0x4\n" - "tbz x15, #1, 107f\n" - "st1 { v8.h }[2], [x13], #0x2\n" - "st1 { v12.h }[2], [x9], #0x2\n" - "st1 { v16.h }[2], [x27], #0x2\n" - "st1 { v20.h }[2], [x25], #0x2\n" - "tbz x15, #0, 110f\n" - "st1 { v8.b }[6], [x13]\n" - "st1 { v12.b }[6], [x9]\n" - "st1 { v16.b }[6], [x27]\n" - "st1 { v20.b }[6], [x25]\n" - "b 110f\n" - "107:" // Height 4: Partial direct writeback: partial_1_4 - "tbz x15, #0, 110f\n" - "st1 { v8.b }[4], [x13]\n" - "st1 { v12.b }[4], [x9]\n" - "st1 { v16.b }[4], [x27]\n" - "st1 { v20.b }[4], [x25]\n" - "b 110f\n" - "108:" // Height 4: Partial direct writeback: partial_2_0 - "tbz x15, #1, 109f\n" - "str h8, [x13], #0x2\n" - "str h12, [x9], #0x2\n" - "str h16, [x27], #0x2\n" - "str h20, [x25], #0x2\n" - "tbz x15, #0, 110f\n" - "st1 { v8.b }[2], [x13]\n" - "st1 { v12.b }[2], [x9]\n" - "st1 { v16.b }[2], [x27]\n" - "st1 { v20.b }[2], [x25]\n" - "b 110f\n" - "109:" // Height 4: Partial direct writeback: partial_1_0 - "str b8, [x13, #0x0]\n" - "str b12, [x9, #0x0]\n" - "str b16, [x27, #0x0]\n" - "str b20, [x25, #0x0]\n" - "110:" // Height 4: Partial direct writeback: Done - "b 112f\n" - "111:" // Height 4: Full writeback - "str q8, [x13, #0x0]\n" - "str q12, [x9, #0x0]\n" - "str q16, [x27, #0x0]\n" - "str q20, [x25, #0x0]\n" - "add x13, x13, #0x10\n" + "bge 107f\n" + "tbz x10, #3, 102f\n" + "str d8, [x9], #0x8\n" + "str d12, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "str d20, [x21], #0x8\n" + "tbz x10, #2, 100f\n" + "st1 { v8.s }[2], [x9], #0x4\n" + "st1 { v12.s }[2], [x23], 
#0x4\n" + "st1 { v16.s }[2], [x22], #0x4\n" + "st1 { v20.s }[2], [x21], #0x4\n" + "tbz x10, #1, 99f\n" + "st1 { v8.h }[6], [x9], #0x2\n" + "st1 { v12.h }[6], [x23], #0x2\n" + "st1 { v16.h }[6], [x22], #0x2\n" + "st1 { v20.h }[6], [x21], #0x2\n" + "tbz x10, #0, 106f\n" + "st1 { v8.b }[14], [x9]\n" + "st1 { v12.b }[14], [x23]\n" + "st1 { v16.b }[14], [x22]\n" + "st1 { v20.b }[14], [x21]\n" + "b 106f\n" + "99:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x10, #0, 106f\n" + "st1 { v8.b }[12], [x9]\n" + "st1 { v12.b }[12], [x23]\n" + "st1 { v16.b }[12], [x22]\n" + "st1 { v20.b }[12], [x21]\n" + "b 106f\n" + "100:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x10, #1, 101f\n" + "st1 { v8.h }[4], [x9], #0x2\n" + "st1 { v12.h }[4], [x23], #0x2\n" + "st1 { v16.h }[4], [x22], #0x2\n" + "st1 { v20.h }[4], [x21], #0x2\n" + "tbz x10, #0, 106f\n" + "st1 { v8.b }[10], [x9]\n" + "st1 { v12.b }[10], [x23]\n" + "st1 { v16.b }[10], [x22]\n" + "st1 { v20.b }[10], [x21]\n" + "b 106f\n" + "101:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x10, #0, 106f\n" + "st1 { v8.b }[8], [x9]\n" + "st1 { v12.b }[8], [x23]\n" + "st1 { v16.b }[8], [x22]\n" + "st1 { v20.b }[8], [x21]\n" + "b 106f\n" + "102:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x10, #2, 104f\n" + "str s8, [x9], #0x4\n" + "str s12, [x23], #0x4\n" + "str s16, [x22], #0x4\n" + "str s20, [x21], #0x4\n" + "tbz x10, #1, 103f\n" + "st1 { v8.h }[2], [x9], #0x2\n" + "st1 { v12.h }[2], [x23], #0x2\n" + "st1 { v16.h }[2], [x22], #0x2\n" + "st1 { v20.h }[2], [x21], #0x2\n" + "tbz x10, #0, 106f\n" + "st1 { v8.b }[6], [x9]\n" + "st1 { v12.b }[6], [x23]\n" + "st1 { v16.b }[6], [x22]\n" + "st1 { v20.b }[6], [x21]\n" + "b 106f\n" + "103:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x10, #0, 106f\n" + "st1 { v8.b }[4], [x9]\n" + "st1 { v12.b }[4], [x23]\n" + "st1 { v16.b }[4], [x22]\n" + "st1 { v20.b }[4], [x21]\n" + "b 106f\n" + "104:" // Height 4: Partial direct writeback: 
partial_2_0 + "tbz x10, #1, 105f\n" + "str h8, [x9], #0x2\n" + "str h12, [x23], #0x2\n" + "str h16, [x22], #0x2\n" + "str h20, [x21], #0x2\n" + "tbz x10, #0, 106f\n" + "st1 { v8.b }[2], [x9]\n" + "st1 { v12.b }[2], [x23]\n" + "st1 { v16.b }[2], [x22]\n" + "st1 { v20.b }[2], [x21]\n" + "b 106f\n" + "105:" // Height 4: Partial direct writeback: partial_1_0 + "str b8, [x9, #0x0]\n" + "str b12, [x23, #0x0]\n" + "str b16, [x22, #0x0]\n" + "str b20, [x21, #0x0]\n" + "106:" // Height 4: Partial direct writeback: Done + "b 108f\n" + "107:" // Height 4: Full writeback + "str q8, [x9, #0x0]\n" "add x9, x9, #0x10\n" - "add x27, x27, #0x10\n" - "add x25, x25, #0x10\n" - "112:" // Height 4: Writeback done - "subs x15, x15, #0x10\n" - "bgt 87b\n" - "b 170f\n" - "113:" // Height 5 - "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "mov x16, %x[col_bias]\n" - "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 114f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "ldr x23, [%x[output_ptr], #0x20]\n" - "add x27, x27, x19\n" - "add x25, x25, x19\n" - "add x23, x23, x19\n" - "b 115f\n" - "114:" // Height 5: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19\n" - "add x27, x9, x19\n" - "add x25, x27, x19\n" - "add x23, x25, x19\n" - "115:" // Height 5: Column loop + "str q12, [x23, #0x0]\n" + "str q16, [x22, #0x0]\n" + "str q20, [x21, #0x0]\n" + "108:" // Height 4: Writeback done + "subs x10, x10, #0x10\n" + "bgt 83b\n" + "b 164f\n" + "109:" // Height 5 + "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x11, %x[col_bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x9, %x[output_ptr]\n" + "ldr x10, 
[%x[args_ptr], %[offsetof_N]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "110:" // Height 5: Column loop "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -2002,260 +1945,260 @@ void a64_hybrid_s8qs_dot_6x16 ( "movi v25.4s, #0x0\n" "movi v26.4s, #0x0\n" "movi v27.4s, #0x0\n" - "116:" // Height 5: setup done - "mov x12, #0x0\n" - "117:" // Height 5: String loop + "111:" // Height 5: setup done + "mov x27, #0x0\n" + "112:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 118f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 113f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "ldr x22, [x20, #0x20]\n" - "cbnz x12, 119f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "cbnz x27, 114f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "add x26, x26, x19\n" + "add x25, x25, x19\n" "add x24, x24, x19\n" + "add x23, x23, x19\n" "add x22, x22, x19\n" - "b 119f\n" - "118:" // Height 5: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "add x26, x28, x19\n" - "add x24, x26, x19\n" - "add x22, x24, x19\n" - "119:" // Height 5: input setup done - "cmp x11, #0x10\n" - "blt 122f\n" - "cmp x11, #0x20\n" - "blt 121f\n" - "120:" // Height 5: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q4, [x22, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "add x21, x21, x19\n" + "b 114f\n" + "113:" // Height 5: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, 
x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "114:" // Height 5: input setup done + "cmp x26, #0x10\n" + "blt 117f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x20\n" + "ldr q2, [x23, #0x0]\n" + "ldr q3, [x22, #0x0]\n" + "ldr q4, [x21, #0x0]\n" + "ldr q6, [x28, #0x0]\n" + "blt 116f\n" + "115:" // Height 5: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x28, #0x10]\n" + "add x25, x25, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x23, x23, #0x10\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x23, #0x80]\n" + "add x22, x22, #0x10\n" ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x28, #0x20]\n" + "add x21, x21, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "add x24, x24, #0x10\n" - "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sub x26, x26, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "add x22, x22, #0x10\n" + "prfm pldl1keep, [x21, #0x80]\n" + "cmp x26, #0x20\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "sub x11, x11, #0x10\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "cmp x11, #0x20\n" ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x28, #0x30]\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d6 // sdot 
v22.4s, v6.16b, v3.4b[0]\n" ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x14, #0x40]\n" + "ldr q6, [x28, #0x40]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x28, #0x50]\n" ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x28, #0x60]\n" ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x28, #0x70]\n" ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x28, #0x80]\n" ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x28, #0x90]\n" ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" ".inst 0x4f84e8d8 // sdot 
v24.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x28, #0xa0]\n" ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x28, #0xb0]\n" ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x28, #0xc0]\n" ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x28, #0xd0]\n" ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x28, #0xe0]\n" ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" + "ldr q7, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - "add x14, x14, #0x100\n" ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" ".inst 0x4fa4e8da // sdot 
v26.4s, v6.16b, v4.4b[3]\n" + "ldr q6, [x28, #0x0]\n" ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + "ldr q0, [x25, #0x0]\n" ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + "ldr q1, [x24, #0x0]\n" ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + "ldr q2, [x23, #0x0]\n" ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + "ldr q3, [x22, #0x0]\n" ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" - "bge 120b\n" - "121:" // Height 5: Multiply loop: Single iteration only - "sub x11, x11, #0x10\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q4, [x22, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "ldr q4, [x21, #0x0]\n" + "bge 115b\n" + "116:" // Height 5: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x28, #0x10]\n" + "sub x26, x26, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "add x25, x25, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "add x28, x28, #0x10\n" + "add x24, x24, #0x10\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x23, x23, #0x10\n" ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "ldr q6, [x14, #0x20]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "add x22, x22, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "add x24, x24, #0x10\n" - "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x28, #0x20]\n" + "add x21, x21, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "add x22, x22, #0x10\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x21, #0x80]\n" ".inst 0x4f83e0f5 // 
sdot v21.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x28, #0x30]\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x14, #0x40]\n" + "ldr q6, [x28, #0x40]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x28, #0x50]\n" ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x28, #0x60]\n" ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x28, #0x70]\n" ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x28, #0x80]\n" ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" ".inst 0x4fa4e0fb // 
sdot v27.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x28, #0x90]\n" ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x28, #0xa0]\n" ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x28, #0xb0]\n" ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x28, #0xc0]\n" ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x28, #0xd0]\n" ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x28, #0xe0]\n" ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" 
+ "ldr q7, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - "add x14, x14, #0x100\n" ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" @@ -2265,34 +2208,34 @@ void a64_hybrid_s8qs_dot_6x16 ( ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" - "122:" // Height 5: Multiply loop: Main loop skip - "cbz x11, 127f\n" - "cmp x11, #0x4\n" - "blt 124f\n" - "123:" // Height 5: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x28], #0x4\n" - "ldr s2, [x26], #0x4\n" - "ldr s3, [x24], #0x4\n" - "ldr s4, [x22], #0x4\n" - "ldr q6, [x14, #0x0]\n" + "117:" // Height 5: Multiply loop: Main loop skip + "cbz x26, 122f\n" + "cmp x26, #0x4\n" + "blt 119f\n" + "118:" // Height 5: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x4\n" + "ldr s1, [x24], #0x4\n" + "cmp x26, #0x4\n" + "ldr s2, [x23], #0x4\n" + "ldr s3, [x22], #0x4\n" + "ldr s4, [x21], #0x4\n" + "ldr q6, [x28, #0x0]\n" ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x28, #0x10]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "sub x11, x11, #0x4\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "cmp x11, #0x4\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x28, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x28, #0x30]\n" + "add x28, x28, #0x40\n" ".inst 0x4f80e0ca // sdot v10.4s, 
v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" @@ -2302,45 +2245,45 @@ void a64_hybrid_s8qs_dot_6x16 ( ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" - "bge 123b\n" - "cbz x11, 127f\n" - "124:" // Height 5: Multiply loop: Skip odd blocks - "tbz x11, #1, 125f\n" - "ldr h0, [x10], #0x2\n" - "ldr h1, [x28], #0x2\n" - "ldr h2, [x26], #0x2\n" - "ldr h3, [x24], #0x2\n" - "ldr h4, [x22], #0x2\n" - "tbz x11, #0, 126f\n" - "ld1 { v0.b }[2], [x10]\n" - "ld1 { v1.b }[2], [x28]\n" - "ld1 { v2.b }[2], [x26]\n" - "ld1 { v3.b }[2], [x24]\n" - "ld1 { v4.b }[2], [x22]\n" - "b 126f\n" - "125:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x10, #0x0]\n" - "ldr b1, [x28, #0x0]\n" - "ldr b2, [x26, #0x0]\n" - "ldr b3, [x24, #0x0]\n" - "ldr b4, [x22, #0x0]\n" - "126:" // Height 5: Multiply loop: Ragged operand read: Done - "ldr q6, [x14, #0x0]\n" + "bge 118b\n" + "cbz x26, 122f\n" + "119:" // Height 5: Multiply loop: Skip odd blocks + "tbz x26, #1, 120f\n" + "ldr h0, [x25], #0x2\n" + "ldr h1, [x24], #0x2\n" + "ldr h2, [x23], #0x2\n" + "ldr h3, [x22], #0x2\n" + "ldr h4, [x21], #0x2\n" + "tbz x26, #0, 121f\n" + "ld1 { v0.b }[2], [x25]\n" + "ld1 { v1.b }[2], [x24]\n" + "ld1 { v2.b }[2], [x23]\n" + "ld1 { v3.b }[2], [x22]\n" + "ld1 { v4.b }[2], [x21]\n" + "b 121f\n" + "120:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x25, #0x0]\n" + "ldr b1, [x24, #0x0]\n" + "ldr b2, [x23, #0x0]\n" + "ldr b3, [x22, #0x0]\n" + "ldr b4, [x21, #0x0]\n" + "121:" // Height 5: Multiply loop: Ragged operand read: Done + "ldr q6, [x28, #0x0]\n" ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x28, #0x10]\n" ".inst 0x4f81e0cc // sdot v12.4s, 
v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x28, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x28, #0x30]\n" + "add x28, x28, #0x40\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" @@ -2350,33 +2293,38 @@ void a64_hybrid_s8qs_dot_6x16 ( ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" - "127:" // Height 5: Multiply loop: No odd multiplies + "122:" // Height 5: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 117b\n" - "prfm pstl1keep, [x13, #0x0]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 112b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" "prfm pstl1keep, [x9, #0x0]\n" - "ldr q0, [x16, #0x0]\n" + "add x23, x9, x19\n" + "ldr q0, [x11, #0x0]\n" "add v8.4s, v8.4s, v0.4s\n" - "prfm pstl1keep, [x27, #0x0]\n" - "ldr q1, [x16, #0x10]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19\n" "add v12.4s, v12.4s, v0.4s\n" - "prfm pstl1keep, [x25, #0x0]\n" - "ldr q2, [x16, #0x20]\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19\n" "add v16.4s, v16.4s, v0.4s\n" - "prfm pstl1keep, [x23, #0x0]\n" - "ldr q3, [x16, #0x30]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "add x20, x21, x19\n" "add v20.4s, v20.4s, v0.4s\n" - 
"add x16, x16, #0x40\n" + "prfm pstl1keep, [x20, #0x0]\n" "add v24.4s, v24.4s, v0.4s\n" + "ldr q1, [x11, #0x10]\n" + "ldr q2, [x11, #0x20]\n" "add v9.4s, v9.4s, v1.4s\n" - "add v13.4s, v13.4s, v1.4s\n" + "ldr q3, [x11, #0x30]\n" + "add x11, x11, #0x40\n" "add v10.4s, v10.4s, v2.4s\n" - "add v11.4s, v11.4s, v3.4s\n" + "add v13.4s, v13.4s, v1.4s\n" "add v14.4s, v14.4s, v2.4s\n" - "add v15.4s, v15.4s, v3.4s\n" "add v17.4s, v17.4s, v1.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v15.4s, v15.4s, v3.4s\n" "add v18.4s, v18.4s, v2.4s\n" "add v19.4s, v19.4s, v3.4s\n" "add v21.4s, v21.4s, v1.4s\n" @@ -2385,30 +2333,30 @@ void a64_hybrid_s8qs_dot_6x16 ( "add v25.4s, v25.4s, v1.4s\n" "add v26.4s, v26.4s, v2.4s\n" "add v27.4s, v27.4s, v3.4s\n" - "tbz %x[flags], #4, 128f\n" - "ldr q0, [x17, #0x0]\n" - "ldr q4, [x8, #0x0]\n" - "ldr q1, [x17, #0x10]\n" - "ldr q5, [x8, #0x10]\n" - "ldr q2, [x17, #0x20]\n" - "ldr q6, [x8, #0x20]\n" - "ldr q3, [x17, #0x30]\n" - "ldr q7, [x8, #0x30]\n" - "add x17, x17, #0x40\n" - "add x8, x8, #0x40\n" - "b 129f\n" - "128:" // Height 5: per layer parameters - "add x19, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x19]\n" + "tbz %x[flags], #4, 123f\n" + "ldr q0, [x12, #0x0]\n" + "ldr q4, [x13, #0x0]\n" + "ldr q1, [x12, #0x10]\n" + "ldr q5, [x13, #0x10]\n" + "ldr q2, [x12, #0x20]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q3, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + "b 124f\n" + "123:" // Height 5: per layer parameters + "add x24, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x24]\n" "mov v1.16b, v0.16b\n" - "add x19, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x19]\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x24]\n" "mov v2.16b, v0.16b\n" "mov v3.16b, v0.16b\n" "mov v5.16b, v4.16b\n" "mov v6.16b, v4.16b\n" "mov v7.16b, v4.16b\n" - "129:" // Height 5: parameters loaded + "124:" // Height 5: parameters loaded "sqrdmulh v8.4s, v8.4s, v4.4s\n" "sqrdmulh v9.4s, 
v9.4s, v5.4s\n" "sqrdmulh v10.4s, v10.4s, v6.4s\n" @@ -2429,7 +2377,7 @@ void a64_hybrid_s8qs_dot_6x16 ( "sqrdmulh v25.4s, v25.4s, v5.4s\n" "sqrdmulh v26.4s, v26.4s, v6.4s\n" "sqrdmulh v27.4s, v27.4s, v7.4s\n" - "tbz %x[flags], #5, 130f\n" + "tbz %x[flags], #5, 125f\n" "and v4.16b, v8.16b, v0.16b\n" "sshr v4.4s, v4.4s, #0x1f\n" "and v5.16b, v9.16b, v1.16b\n" @@ -2490,18 +2438,18 @@ void a64_hybrid_s8qs_dot_6x16 ( "sqadd v25.4s, v25.4s, v5.4s\n" "sqadd v26.4s, v26.4s, v6.4s\n" "sqadd v27.4s, v27.4s, v7.4s\n" - "130:" // Height 5: no shift correction + "125:" // Height 5: no shift correction "srshl v8.4s, v8.4s, v0.4s\n" - "add x19, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x19]\n" + "add x24, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x24]\n" "srshl v9.4s, v9.4s, v1.4s\n" - "add x19, %x[qp], %[minval]\n" + "add x24, %x[qp], %[minval]\n" "srshl v10.4s, v10.4s, v2.4s\n" - "ld1r { v5.4s }, [x19]\n" - "add x19, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x24]\n" + "add x24, %x[qp], %[maxval]\n" "srshl v11.4s, v11.4s, v3.4s\n" - "ld1r { v6.4s }, [x19]\n" - "cmp x15, #0x10\n" + "ld1r { v6.4s }, [x24]\n" + "cmp x10, #0x10\n" "srshl v12.4s, v12.4s, v0.4s\n" "srshl v13.4s, v13.4s, v1.4s\n" "srshl v14.4s, v14.4s, v2.4s\n" @@ -2593,158 +2541,134 @@ void a64_hybrid_s8qs_dot_6x16 ( "uzp1 v16.16b, v16.16b, v17.16b\n" "uzp1 v20.16b, v20.16b, v21.16b\n" "uzp1 v24.16b, v24.16b, v25.16b\n" - "bge 139f\n" - "tbz x15, #3, 134f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "str d16, [x27], #0x8\n" - "str d20, [x25], #0x8\n" - "str d24, [x23], #0x8\n" - "tbz x15, #2, 132f\n" - "st1 { v8.s }[2], [x13], #0x4\n" - "st1 { v12.s }[2], [x9], #0x4\n" - "st1 { v16.s }[2], [x27], #0x4\n" - "st1 { v20.s }[2], [x25], #0x4\n" - "st1 { v24.s }[2], [x23], #0x4\n" - "tbz x15, #1, 131f\n" - "st1 { v8.h }[6], [x13], #0x2\n" - "st1 { v12.h }[6], [x9], #0x2\n" - "st1 { v16.h }[6], [x27], #0x2\n" - "st1 { v20.h }[6], [x25], #0x2\n" - "st1 { v24.h }[6], [x23], #0x2\n" - "tbz x15, #0, 138f\n" - "st1 
{ v8.b }[14], [x13]\n" - "st1 { v12.b }[14], [x9]\n" - "st1 { v16.b }[14], [x27]\n" - "st1 { v20.b }[14], [x25]\n" - "st1 { v24.b }[14], [x23]\n" - "b 138f\n" - "131:" // Height 5: Partial direct writeback: partial_1_12 - "tbz x15, #0, 138f\n" - "st1 { v8.b }[12], [x13]\n" - "st1 { v12.b }[12], [x9]\n" - "st1 { v16.b }[12], [x27]\n" - "st1 { v20.b }[12], [x25]\n" - "st1 { v24.b }[12], [x23]\n" - "b 138f\n" - "132:" // Height 5: Partial direct writeback: partial_2_8 - "tbz x15, #1, 133f\n" - "st1 { v8.h }[4], [x13], #0x2\n" - "st1 { v12.h }[4], [x9], #0x2\n" - "st1 { v16.h }[4], [x27], #0x2\n" - "st1 { v20.h }[4], [x25], #0x2\n" - "st1 { v24.h }[4], [x23], #0x2\n" - "tbz x15, #0, 138f\n" - "st1 { v8.b }[10], [x13]\n" - "st1 { v12.b }[10], [x9]\n" - "st1 { v16.b }[10], [x27]\n" - "st1 { v20.b }[10], [x25]\n" - "st1 { v24.b }[10], [x23]\n" - "b 138f\n" - "133:" // Height 5: Partial direct writeback: partial_1_8 - "tbz x15, #0, 138f\n" - "st1 { v8.b }[8], [x13]\n" - "st1 { v12.b }[8], [x9]\n" - "st1 { v16.b }[8], [x27]\n" - "st1 { v20.b }[8], [x25]\n" - "st1 { v24.b }[8], [x23]\n" - "b 138f\n" - "134:" // Height 5: Partial direct writeback: partial_4_0 - "tbz x15, #2, 136f\n" - "str s8, [x13], #0x4\n" - "str s12, [x9], #0x4\n" - "str s16, [x27], #0x4\n" - "str s20, [x25], #0x4\n" - "str s24, [x23], #0x4\n" - "tbz x15, #1, 135f\n" - "st1 { v8.h }[2], [x13], #0x2\n" - "st1 { v12.h }[2], [x9], #0x2\n" - "st1 { v16.h }[2], [x27], #0x2\n" - "st1 { v20.h }[2], [x25], #0x2\n" - "st1 { v24.h }[2], [x23], #0x2\n" - "tbz x15, #0, 138f\n" - "st1 { v8.b }[6], [x13]\n" - "st1 { v12.b }[6], [x9]\n" - "st1 { v16.b }[6], [x27]\n" - "st1 { v20.b }[6], [x25]\n" - "st1 { v24.b }[6], [x23]\n" - "b 138f\n" - "135:" // Height 5: Partial direct writeback: partial_1_4 - "tbz x15, #0, 138f\n" - "st1 { v8.b }[4], [x13]\n" - "st1 { v12.b }[4], [x9]\n" - "st1 { v16.b }[4], [x27]\n" - "st1 { v20.b }[4], [x25]\n" - "st1 { v24.b }[4], [x23]\n" - "b 138f\n" - "136:" // Height 5: Partial direct 
writeback: partial_2_0 - "tbz x15, #1, 137f\n" - "str h8, [x13], #0x2\n" - "str h12, [x9], #0x2\n" - "str h16, [x27], #0x2\n" - "str h20, [x25], #0x2\n" - "str h24, [x23], #0x2\n" - "tbz x15, #0, 138f\n" - "st1 { v8.b }[2], [x13]\n" - "st1 { v12.b }[2], [x9]\n" - "st1 { v16.b }[2], [x27]\n" - "st1 { v20.b }[2], [x25]\n" - "st1 { v24.b }[2], [x23]\n" - "b 138f\n" - "137:" // Height 5: Partial direct writeback: partial_1_0 - "str b8, [x13, #0x0]\n" - "str b12, [x9, #0x0]\n" - "str b16, [x27, #0x0]\n" - "str b20, [x25, #0x0]\n" - "str b24, [x23, #0x0]\n" - "138:" // Height 5: Partial direct writeback: Done - "b 140f\n" - "139:" // Height 5: Full writeback - "str q8, [x13, #0x0]\n" - "str q12, [x9, #0x0]\n" - "str q16, [x27, #0x0]\n" - "str q20, [x25, #0x0]\n" - "str q24, [x23, #0x0]\n" - "add x13, x13, #0x10\n" + "bge 134f\n" + "tbz x10, #3, 129f\n" + "str d8, [x9], #0x8\n" + "str d12, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "str d20, [x21], #0x8\n" + "str d24, [x20], #0x8\n" + "tbz x10, #2, 127f\n" + "st1 { v8.s }[2], [x9], #0x4\n" + "st1 { v12.s }[2], [x23], #0x4\n" + "st1 { v16.s }[2], [x22], #0x4\n" + "st1 { v20.s }[2], [x21], #0x4\n" + "st1 { v24.s }[2], [x20], #0x4\n" + "tbz x10, #1, 126f\n" + "st1 { v8.h }[6], [x9], #0x2\n" + "st1 { v12.h }[6], [x23], #0x2\n" + "st1 { v16.h }[6], [x22], #0x2\n" + "st1 { v20.h }[6], [x21], #0x2\n" + "st1 { v24.h }[6], [x20], #0x2\n" + "tbz x10, #0, 133f\n" + "st1 { v8.b }[14], [x9]\n" + "st1 { v12.b }[14], [x23]\n" + "st1 { v16.b }[14], [x22]\n" + "st1 { v20.b }[14], [x21]\n" + "st1 { v24.b }[14], [x20]\n" + "b 133f\n" + "126:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x10, #0, 133f\n" + "st1 { v8.b }[12], [x9]\n" + "st1 { v12.b }[12], [x23]\n" + "st1 { v16.b }[12], [x22]\n" + "st1 { v20.b }[12], [x21]\n" + "st1 { v24.b }[12], [x20]\n" + "b 133f\n" + "127:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x10, #1, 128f\n" + "st1 { v8.h }[4], [x9], #0x2\n" + "st1 { v12.h }[4], [x23], #0x2\n" + "st1 { 
v16.h }[4], [x22], #0x2\n" + "st1 { v20.h }[4], [x21], #0x2\n" + "st1 { v24.h }[4], [x20], #0x2\n" + "tbz x10, #0, 133f\n" + "st1 { v8.b }[10], [x9]\n" + "st1 { v12.b }[10], [x23]\n" + "st1 { v16.b }[10], [x22]\n" + "st1 { v20.b }[10], [x21]\n" + "st1 { v24.b }[10], [x20]\n" + "b 133f\n" + "128:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x10, #0, 133f\n" + "st1 { v8.b }[8], [x9]\n" + "st1 { v12.b }[8], [x23]\n" + "st1 { v16.b }[8], [x22]\n" + "st1 { v20.b }[8], [x21]\n" + "st1 { v24.b }[8], [x20]\n" + "b 133f\n" + "129:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x10, #2, 131f\n" + "str s8, [x9], #0x4\n" + "str s12, [x23], #0x4\n" + "str s16, [x22], #0x4\n" + "str s20, [x21], #0x4\n" + "str s24, [x20], #0x4\n" + "tbz x10, #1, 130f\n" + "st1 { v8.h }[2], [x9], #0x2\n" + "st1 { v12.h }[2], [x23], #0x2\n" + "st1 { v16.h }[2], [x22], #0x2\n" + "st1 { v20.h }[2], [x21], #0x2\n" + "st1 { v24.h }[2], [x20], #0x2\n" + "tbz x10, #0, 133f\n" + "st1 { v8.b }[6], [x9]\n" + "st1 { v12.b }[6], [x23]\n" + "st1 { v16.b }[6], [x22]\n" + "st1 { v20.b }[6], [x21]\n" + "st1 { v24.b }[6], [x20]\n" + "b 133f\n" + "130:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x10, #0, 133f\n" + "st1 { v8.b }[4], [x9]\n" + "st1 { v12.b }[4], [x23]\n" + "st1 { v16.b }[4], [x22]\n" + "st1 { v20.b }[4], [x21]\n" + "st1 { v24.b }[4], [x20]\n" + "b 133f\n" + "131:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x10, #1, 132f\n" + "str h8, [x9], #0x2\n" + "str h12, [x23], #0x2\n" + "str h16, [x22], #0x2\n" + "str h20, [x21], #0x2\n" + "str h24, [x20], #0x2\n" + "tbz x10, #0, 133f\n" + "st1 { v8.b }[2], [x9]\n" + "st1 { v12.b }[2], [x23]\n" + "st1 { v16.b }[2], [x22]\n" + "st1 { v20.b }[2], [x21]\n" + "st1 { v24.b }[2], [x20]\n" + "b 133f\n" + "132:" // Height 5: Partial direct writeback: partial_1_0 + "str b8, [x9, #0x0]\n" + "str b12, [x23, #0x0]\n" + "str b16, [x22, #0x0]\n" + "str b20, [x21, #0x0]\n" + "str b24, [x20, #0x0]\n" + "133:" // Height 5: 
Partial direct writeback: Done + "b 135f\n" + "134:" // Height 5: Full writeback + "str q8, [x9, #0x0]\n" "add x9, x9, #0x10\n" - "add x27, x27, #0x10\n" - "add x25, x25, #0x10\n" - "add x23, x23, #0x10\n" - "140:" // Height 5: Writeback done - "subs x15, x15, #0x10\n" - "bgt 115b\n" - "b 170f\n" - "141:" // Height 6 - "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "mov x16, %x[col_bias]\n" - "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "str q12, [x23, #0x0]\n" + "str q16, [x22, #0x0]\n" + "str q20, [x21, #0x0]\n" + "str q24, [x20, #0x0]\n" + "135:" // Height 5: Writeback done + "subs x10, x10, #0x10\n" + "bgt 110b\n" + "b 164f\n" + "136:" // Height 6 + "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x11, %x[col_bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x9, %x[output_ptr]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x20, #0x6\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 142f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "ldr x23, [%x[output_ptr], #0x20]\n" - "add x27, x27, x19\n" - "ldr x21, [%x[output_ptr], #0x28]\n" - "add %x[output_ptr], %x[output_ptr], #0x30\n" - "add x25, x25, x19\n" - "add x23, x23, x19\n" - "add x21, x21, x19\n" - "b 143f\n" - "142:" // Height 6: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19\n" - "add x27, x9, x19\n" - "add x25, x27, x19\n" - "add x23, x25, x19\n" - "add x21, x23, x19\n" - "add %x[output_ptr], x21, x19\n" - "143:" // Height 6: Column loop + "madd %x[output_ptr], x19, x20, %x[output_ptr]\n" + "137:" // Height 6: Column loop "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -2769,299 
+2693,299 @@ void a64_hybrid_s8qs_dot_6x16 ( "movi v29.4s, #0x0\n" "movi v30.4s, #0x0\n" "movi v31.4s, #0x0\n" - "144:" // Height 6: setup done - "mov x12, #0x0\n" - "145:" // Height 6: String loop + "138:" // Height 6: setup done + "mov x27, #0x0\n" + "139:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 146f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 140f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "ldr x22, [x20, #0x20]\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" "ldr x20, [x20, #0x28]\n" - "cbnz x12, 147f\n" + "cbnz x27, 141f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "add x26, x26, x19\n" + "add x25, x25, x19\n" "add x24, x24, x19\n" + "add x23, x23, x19\n" "add x22, x22, x19\n" + "add x21, x21, x19\n" "add x20, x20, x19\n" - "b 147f\n" - "146:" // Height 6: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "add x26, x28, x19\n" - "add x24, x26, x19\n" - "add x22, x24, x19\n" - "add x20, x22, x19\n" - "147:" // Height 6: input setup done - "cmp x11, #0x10\n" - "blt 150f\n" - "cmp x11, #0x20\n" - "blt 149f\n" - "148:" // Height 6: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q4, [x22, #0x0]\n" + "b 141f\n" + "140:" // Height 6: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "add x20, x21, x19\n" + "141:" // Height 6: input setup done 
+ "cmp x26, #0x10\n" + "blt 144f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x20\n" + "ldr q2, [x23, #0x0]\n" + "ldr q3, [x22, #0x0]\n" + "ldr q4, [x21, #0x0]\n" "ldr q5, [x20, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "ldr q6, [x28, #0x0]\n" + "blt 143f\n" + "142:" // Height 6: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x28, #0x10]\n" + "add x25, x25, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x23, x23, #0x10\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x23, #0x80]\n" + "add x22, x22, #0x10\n" ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "add x24, x24, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "add x21, x21, #0x10\n" ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x28, #0x20]\n" + "add x20, x20, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "add x22, x22, #0x10\n" - "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "sub x26, x26, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "add x20, x20, #0x10\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" "prfm pldl1keep, [x20, #0x80]\n" - "sub x11, x11, #0x10\n" + "cmp x26, #0x20\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "cmp x11, #0x20\n" ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x28, #0x30]\n" ".inst 
0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" - "ldr q6, [x14, #0x40]\n" + "ldr q6, [x28, #0x40]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x28, #0x50]\n" ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x28, #0x60]\n" ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x28, #0x70]\n" ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x28, #0x80]\n" ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" ".inst 
0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x28, #0x90]\n" ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x28, #0xa0]\n" ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x28, #0xb0]\n" ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x28, #0xc0]\n" ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x28, #0xd0]\n" ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" ".inst 
0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x28, #0xe0]\n" ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" + "ldr q7, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - "add x14, x14, #0x100\n" ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" ".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n" + "ldr q6, [x28, #0x0]\n" ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + "ldr q0, [x25, #0x0]\n" ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + "ldr q1, [x24, #0x0]\n" ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + "ldr q2, [x23, #0x0]\n" ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + "ldr q3, [x22, #0x0]\n" ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + "ldr q4, [x21, #0x0]\n" ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n" - "bge 148b\n" - "149:" // Height 6: Multiply loop: Single iteration only - "sub x11, x11, #0x10\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q4, [x22, #0x0]\n" "ldr q5, [x20, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "bge 142b\n" + "143:" // Height 6: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x28, #0x10]\n" + "sub x26, x26, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "add 
x25, x25, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "add x28, x28, #0x10\n" - ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" - ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" "add x24, x24, #0x10\n" - ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" "prfm pldl1keep, [x24, #0x80]\n" - "ldr q6, [x14, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x23, x23, #0x10\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x23, #0x80]\n" "add x22, x22, #0x10\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" "prfm pldl1keep, [x22, #0x80]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x21, x21, #0x10\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q6, [x28, #0x20]\n" "add x20, x20, #0x10\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "prfm pldl1keep, [x21, #0x80]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" "prfm pldl1keep, [x20, #0x80]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x28, #0x30]\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" - "ldr q6, [x14, #0x40]\n" + "ldr q6, [x28, #0x40]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" 
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x28, #0x50]\n" ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x28, #0x60]\n" ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x28, #0x70]\n" ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x28, #0x80]\n" ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x28, #0x90]\n" ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n" - 
"ldr q6, [x14, #0xa0]\n" + "ldr q6, [x28, #0xa0]\n" ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x28, #0xb0]\n" ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x28, #0xc0]\n" ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x28, #0xd0]\n" ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x28, #0xe0]\n" ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" + "ldr q7, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" ".inst 0x4fa0e8ca // sdot v10.4s, 
v6.16b, v0.4b[3]\n" - "add x14, x14, #0x100\n" ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" @@ -3073,37 +2997,37 @@ void a64_hybrid_s8qs_dot_6x16 ( ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n" - "150:" // Height 6: Multiply loop: Main loop skip - "cbz x11, 155f\n" - "cmp x11, #0x4\n" - "blt 152f\n" - "151:" // Height 6: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x28], #0x4\n" - "ldr s2, [x26], #0x4\n" - "ldr s3, [x24], #0x4\n" - "ldr s4, [x22], #0x4\n" + "144:" // Height 6: Multiply loop: Main loop skip + "cbz x26, 149f\n" + "cmp x26, #0x4\n" + "blt 146f\n" + "145:" // Height 6: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x4\n" + "ldr s1, [x24], #0x4\n" + "cmp x26, #0x4\n" + "ldr s2, [x23], #0x4\n" + "ldr s3, [x22], #0x4\n" + "ldr s4, [x21], #0x4\n" "ldr s5, [x20], #0x4\n" - "ldr q6, [x14, #0x0]\n" + "ldr q6, [x28, #0x0]\n" ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x28, #0x10]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "sub x11, x11, #0x4\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "cmp x11, #0x4\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x28, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x28, #0x30]\n" + "add x28, x28, 
#0x40\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" @@ -3115,50 +3039,50 @@ void a64_hybrid_s8qs_dot_6x16 ( ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" - "bge 151b\n" - "cbz x11, 155f\n" - "152:" // Height 6: Multiply loop: Skip odd blocks - "tbz x11, #1, 153f\n" - "ldr h0, [x10], #0x2\n" - "ldr h1, [x28], #0x2\n" - "ldr h2, [x26], #0x2\n" - "ldr h3, [x24], #0x2\n" - "ldr h4, [x22], #0x2\n" + "bge 145b\n" + "cbz x26, 149f\n" + "146:" // Height 6: Multiply loop: Skip odd blocks + "tbz x26, #1, 147f\n" + "ldr h0, [x25], #0x2\n" + "ldr h1, [x24], #0x2\n" + "ldr h2, [x23], #0x2\n" + "ldr h3, [x22], #0x2\n" + "ldr h4, [x21], #0x2\n" "ldr h5, [x20], #0x2\n" - "tbz x11, #0, 154f\n" - "ld1 { v0.b }[2], [x10]\n" - "ld1 { v1.b }[2], [x28]\n" - "ld1 { v2.b }[2], [x26]\n" - "ld1 { v3.b }[2], [x24]\n" - "ld1 { v4.b }[2], [x22]\n" + "tbz x26, #0, 148f\n" + "ld1 { v0.b }[2], [x25]\n" + "ld1 { v1.b }[2], [x24]\n" + "ld1 { v2.b }[2], [x23]\n" + "ld1 { v3.b }[2], [x22]\n" + "ld1 { v4.b }[2], [x21]\n" "ld1 { v5.b }[2], [x20]\n" - "b 154f\n" - "153:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x10, #0x0]\n" - "ldr b1, [x28, #0x0]\n" - "ldr b2, [x26, #0x0]\n" - "ldr b3, [x24, #0x0]\n" - "ldr b4, [x22, #0x0]\n" + "b 148f\n" + "147:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x25, #0x0]\n" + "ldr b1, [x24, #0x0]\n" + "ldr b2, [x23, #0x0]\n" + "ldr b3, [x22, #0x0]\n" + "ldr b4, [x21, #0x0]\n" "ldr b5, [x20, #0x0]\n" - "154:" // Height 6: Multiply loop: Ragged operand read: Done - "ldr q6, [x14, #0x0]\n" + "148:" // Height 6: Multiply loop: Ragged operand read: Done + "ldr q6, [x28, #0x0]\n" ".inst 0x4f80e0c8 // sdot 
v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x28, #0x10]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x28, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x28, #0x30]\n" + "add x28, x28, #0x40\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" @@ -3170,35 +3094,41 @@ void a64_hybrid_s8qs_dot_6x16 ( ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" - "155:" // Height 6: Multiply loop: No odd multiplies + "149:" // Height 6: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 145b\n" - "prfm pstl1keep, [x13, #0x0]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 139b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" "prfm pstl1keep, [x9, #0x0]\n" - "ldr q0, [x16, #0x0]\n" + "add x23, x9, x19\n" + "ldr q0, [x11, #0x0]\n" "add v8.4s, v8.4s, v0.4s\n" - "prfm pstl1keep, [x27, #0x0]\n" - "ldr q1, [x16, #0x10]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19\n" "add v12.4s, v12.4s, v0.4s\n" - "prfm pstl1keep, [x25, #0x0]\n" - "ldr q2, [x16, #0x20]\n" + "prfm 
pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19\n" "add v16.4s, v16.4s, v0.4s\n" - "prfm pstl1keep, [x23, #0x0]\n" - "ldr q3, [x16, #0x30]\n" - "add v20.4s, v20.4s, v0.4s\n" "prfm pstl1keep, [x21, #0x0]\n" - "add x16, x16, #0x40\n" + "add x20, x21, x19\n" + "add v20.4s, v20.4s, v0.4s\n" + "prfm pstl1keep, [x20, #0x0]\n" + "add x19, x20, x19\n" "add v24.4s, v24.4s, v0.4s\n" + "prfm pstl1keep, [x19, #0x0]\n" "add v28.4s, v28.4s, v0.4s\n" + "ldr q1, [x11, #0x10]\n" + "ldr q2, [x11, #0x20]\n" "add v9.4s, v9.4s, v1.4s\n" + "ldr q3, [x11, #0x30]\n" + "add x11, x11, #0x40\n" "add v10.4s, v10.4s, v2.4s\n" - "add v11.4s, v11.4s, v3.4s\n" "add v13.4s, v13.4s, v1.4s\n" "add v14.4s, v14.4s, v2.4s\n" - "add v15.4s, v15.4s, v3.4s\n" "add v17.4s, v17.4s, v1.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v15.4s, v15.4s, v3.4s\n" "add v18.4s, v18.4s, v2.4s\n" "add v19.4s, v19.4s, v3.4s\n" "add v21.4s, v21.4s, v1.4s\n" @@ -3210,30 +3140,30 @@ void a64_hybrid_s8qs_dot_6x16 ( "add v29.4s, v29.4s, v1.4s\n" "add v30.4s, v30.4s, v2.4s\n" "add v31.4s, v31.4s, v3.4s\n" - "tbz %x[flags], #4, 156f\n" - "ldr q0, [x17, #0x0]\n" - "ldr q4, [x8, #0x0]\n" - "ldr q1, [x17, #0x10]\n" - "ldr q5, [x8, #0x10]\n" - "ldr q2, [x17, #0x20]\n" - "ldr q6, [x8, #0x20]\n" - "ldr q3, [x17, #0x30]\n" - "ldr q7, [x8, #0x30]\n" - "add x17, x17, #0x40\n" - "add x8, x8, #0x40\n" - "b 157f\n" - "156:" // Height 6: per layer parameters - "add x19, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x19]\n" + "tbz %x[flags], #4, 150f\n" + "ldr q0, [x12, #0x0]\n" + "ldr q4, [x13, #0x0]\n" + "ldr q1, [x12, #0x10]\n" + "ldr q5, [x13, #0x10]\n" + "ldr q2, [x12, #0x20]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q3, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + "b 151f\n" + "150:" // Height 6: per layer parameters + "add x24, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x24]\n" "mov v1.16b, v0.16b\n" - "add x19, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x19]\n" 
+ "add x24, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x24]\n" "mov v2.16b, v0.16b\n" "mov v3.16b, v0.16b\n" "mov v5.16b, v4.16b\n" "mov v6.16b, v4.16b\n" "mov v7.16b, v4.16b\n" - "157:" // Height 6: parameters loaded + "151:" // Height 6: parameters loaded "sqrdmulh v8.4s, v8.4s, v4.4s\n" "sqrdmulh v9.4s, v9.4s, v5.4s\n" "sqrdmulh v10.4s, v10.4s, v6.4s\n" @@ -3258,7 +3188,7 @@ void a64_hybrid_s8qs_dot_6x16 ( "sqrdmulh v29.4s, v29.4s, v5.4s\n" "sqrdmulh v30.4s, v30.4s, v6.4s\n" "sqrdmulh v31.4s, v31.4s, v7.4s\n" - "tbz %x[flags], #5, 158f\n" + "tbz %x[flags], #5, 152f\n" "and v4.16b, v8.16b, v0.16b\n" "sshr v4.4s, v4.4s, #0x1f\n" "and v5.16b, v9.16b, v1.16b\n" @@ -3331,18 +3261,18 @@ void a64_hybrid_s8qs_dot_6x16 ( "sqadd v29.4s, v29.4s, v5.4s\n" "sqadd v30.4s, v30.4s, v6.4s\n" "sqadd v31.4s, v31.4s, v7.4s\n" - "158:" // Height 6: no shift correction + "152:" // Height 6: no shift correction "srshl v8.4s, v8.4s, v0.4s\n" - "add x19, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x19]\n" + "add x24, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x24]\n" "srshl v9.4s, v9.4s, v1.4s\n" - "add x19, %x[qp], %[minval]\n" + "add x24, %x[qp], %[minval]\n" "srshl v10.4s, v10.4s, v2.4s\n" - "ld1r { v5.4s }, [x19]\n" - "add x19, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x24]\n" + "add x24, %x[qp], %[maxval]\n" "srshl v11.4s, v11.4s, v3.4s\n" - "ld1r { v6.4s }, [x19]\n" - "cmp x15, #0x10\n" + "ld1r { v6.4s }, [x24]\n" + "cmp x10, #0x10\n" "srshl v12.4s, v12.4s, v0.4s\n" "srshl v13.4s, v13.4s, v1.4s\n" "srshl v14.4s, v14.4s, v2.4s\n" @@ -3453,159 +3383,154 @@ void a64_hybrid_s8qs_dot_6x16 ( "uzp1 v20.16b, v20.16b, v21.16b\n" "uzp1 v24.16b, v24.16b, v25.16b\n" "uzp1 v28.16b, v28.16b, v29.16b\n" - "bge 167f\n" - "tbz x15, #3, 162f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "str d16, [x27], #0x8\n" - "str d20, [x25], #0x8\n" - "str d24, [x23], #0x8\n" - "str d28, [x21], #0x8\n" - "tbz x15, #2, 160f\n" - "st1 { v8.s }[2], [x13], #0x4\n" - "st1 { v12.s }[2], [x9], #0x4\n" - 
"st1 { v16.s }[2], [x27], #0x4\n" - "st1 { v20.s }[2], [x25], #0x4\n" - "st1 { v24.s }[2], [x23], #0x4\n" - "st1 { v28.s }[2], [x21], #0x4\n" - "tbz x15, #1, 159f\n" - "st1 { v8.h }[6], [x13], #0x2\n" - "st1 { v12.h }[6], [x9], #0x2\n" - "st1 { v16.h }[6], [x27], #0x2\n" - "st1 { v20.h }[6], [x25], #0x2\n" - "st1 { v24.h }[6], [x23], #0x2\n" - "st1 { v28.h }[6], [x21], #0x2\n" - "tbz x15, #0, 166f\n" - "st1 { v8.b }[14], [x13]\n" - "st1 { v12.b }[14], [x9]\n" - "st1 { v16.b }[14], [x27]\n" - "st1 { v20.b }[14], [x25]\n" - "st1 { v24.b }[14], [x23]\n" - "st1 { v28.b }[14], [x21]\n" - "b 166f\n" - "159:" // Height 6: Partial direct writeback: partial_1_12 - "tbz x15, #0, 166f\n" - "st1 { v8.b }[12], [x13]\n" - "st1 { v12.b }[12], [x9]\n" - "st1 { v16.b }[12], [x27]\n" - "st1 { v20.b }[12], [x25]\n" - "st1 { v24.b }[12], [x23]\n" - "st1 { v28.b }[12], [x21]\n" - "b 166f\n" - "160:" // Height 6: Partial direct writeback: partial_2_8 - "tbz x15, #1, 161f\n" - "st1 { v8.h }[4], [x13], #0x2\n" - "st1 { v12.h }[4], [x9], #0x2\n" - "st1 { v16.h }[4], [x27], #0x2\n" - "st1 { v20.h }[4], [x25], #0x2\n" - "st1 { v24.h }[4], [x23], #0x2\n" - "st1 { v28.h }[4], [x21], #0x2\n" - "tbz x15, #0, 166f\n" - "st1 { v8.b }[10], [x13]\n" - "st1 { v12.b }[10], [x9]\n" - "st1 { v16.b }[10], [x27]\n" - "st1 { v20.b }[10], [x25]\n" - "st1 { v24.b }[10], [x23]\n" - "st1 { v28.b }[10], [x21]\n" - "b 166f\n" - "161:" // Height 6: Partial direct writeback: partial_1_8 - "tbz x15, #0, 166f\n" - "st1 { v8.b }[8], [x13]\n" - "st1 { v12.b }[8], [x9]\n" - "st1 { v16.b }[8], [x27]\n" - "st1 { v20.b }[8], [x25]\n" - "st1 { v24.b }[8], [x23]\n" - "st1 { v28.b }[8], [x21]\n" - "b 166f\n" - "162:" // Height 6: Partial direct writeback: partial_4_0 - "tbz x15, #2, 164f\n" - "str s8, [x13], #0x4\n" - "str s12, [x9], #0x4\n" - "str s16, [x27], #0x4\n" - "str s20, [x25], #0x4\n" - "str s24, [x23], #0x4\n" - "str s28, [x21], #0x4\n" - "tbz x15, #1, 163f\n" - "st1 { v8.h }[2], [x13], #0x2\n" - "st1 { v12.h 
}[2], [x9], #0x2\n" - "st1 { v16.h }[2], [x27], #0x2\n" - "st1 { v20.h }[2], [x25], #0x2\n" - "st1 { v24.h }[2], [x23], #0x2\n" - "st1 { v28.h }[2], [x21], #0x2\n" - "tbz x15, #0, 166f\n" - "st1 { v8.b }[6], [x13]\n" - "st1 { v12.b }[6], [x9]\n" - "st1 { v16.b }[6], [x27]\n" - "st1 { v20.b }[6], [x25]\n" - "st1 { v24.b }[6], [x23]\n" - "st1 { v28.b }[6], [x21]\n" - "b 166f\n" - "163:" // Height 6: Partial direct writeback: partial_1_4 - "tbz x15, #0, 166f\n" - "st1 { v8.b }[4], [x13]\n" - "st1 { v12.b }[4], [x9]\n" - "st1 { v16.b }[4], [x27]\n" - "st1 { v20.b }[4], [x25]\n" - "st1 { v24.b }[4], [x23]\n" - "st1 { v28.b }[4], [x21]\n" - "b 166f\n" - "164:" // Height 6: Partial direct writeback: partial_2_0 - "tbz x15, #1, 165f\n" - "str h8, [x13], #0x2\n" - "str h12, [x9], #0x2\n" - "str h16, [x27], #0x2\n" - "str h20, [x25], #0x2\n" - "str h24, [x23], #0x2\n" - "str h28, [x21], #0x2\n" - "tbz x15, #0, 166f\n" - "st1 { v8.b }[2], [x13]\n" - "st1 { v12.b }[2], [x9]\n" - "st1 { v16.b }[2], [x27]\n" - "st1 { v20.b }[2], [x25]\n" - "st1 { v24.b }[2], [x23]\n" - "st1 { v28.b }[2], [x21]\n" - "b 166f\n" - "165:" // Height 6: Partial direct writeback: partial_1_0 - "str b8, [x13, #0x0]\n" - "str b12, [x9, #0x0]\n" - "str b16, [x27, #0x0]\n" - "str b20, [x25, #0x0]\n" - "str b24, [x23, #0x0]\n" - "str b28, [x21, #0x0]\n" - "166:" // Height 6: Partial direct writeback: Done - "b 168f\n" - "167:" // Height 6: Full writeback - "str q8, [x13, #0x0]\n" - "str q12, [x9, #0x0]\n" - "str q16, [x27, #0x0]\n" - "str q20, [x25, #0x0]\n" - "str q24, [x23, #0x0]\n" - "str q28, [x21, #0x0]\n" - "add x13, x13, #0x10\n" + "bge 161f\n" + "tbz x10, #3, 156f\n" + "str d8, [x9], #0x8\n" + "str d12, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "str d20, [x21], #0x8\n" + "str d24, [x20], #0x8\n" + "str d28, [x19], #0x8\n" + "tbz x10, #2, 154f\n" + "st1 { v8.s }[2], [x9], #0x4\n" + "st1 { v12.s }[2], [x23], #0x4\n" + "st1 { v16.s }[2], [x22], #0x4\n" + "st1 { v20.s }[2], [x21], #0x4\n" + "st1 { 
v24.s }[2], [x20], #0x4\n" + "st1 { v28.s }[2], [x19], #0x4\n" + "tbz x10, #1, 153f\n" + "st1 { v8.h }[6], [x9], #0x2\n" + "st1 { v12.h }[6], [x23], #0x2\n" + "st1 { v16.h }[6], [x22], #0x2\n" + "st1 { v20.h }[6], [x21], #0x2\n" + "st1 { v24.h }[6], [x20], #0x2\n" + "st1 { v28.h }[6], [x19], #0x2\n" + "tbz x10, #0, 160f\n" + "st1 { v8.b }[14], [x9]\n" + "st1 { v12.b }[14], [x23]\n" + "st1 { v16.b }[14], [x22]\n" + "st1 { v20.b }[14], [x21]\n" + "st1 { v24.b }[14], [x20]\n" + "st1 { v28.b }[14], [x19]\n" + "b 160f\n" + "153:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x10, #0, 160f\n" + "st1 { v8.b }[12], [x9]\n" + "st1 { v12.b }[12], [x23]\n" + "st1 { v16.b }[12], [x22]\n" + "st1 { v20.b }[12], [x21]\n" + "st1 { v24.b }[12], [x20]\n" + "st1 { v28.b }[12], [x19]\n" + "b 160f\n" + "154:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x10, #1, 155f\n" + "st1 { v8.h }[4], [x9], #0x2\n" + "st1 { v12.h }[4], [x23], #0x2\n" + "st1 { v16.h }[4], [x22], #0x2\n" + "st1 { v20.h }[4], [x21], #0x2\n" + "st1 { v24.h }[4], [x20], #0x2\n" + "st1 { v28.h }[4], [x19], #0x2\n" + "tbz x10, #0, 160f\n" + "st1 { v8.b }[10], [x9]\n" + "st1 { v12.b }[10], [x23]\n" + "st1 { v16.b }[10], [x22]\n" + "st1 { v20.b }[10], [x21]\n" + "st1 { v24.b }[10], [x20]\n" + "st1 { v28.b }[10], [x19]\n" + "b 160f\n" + "155:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x10, #0, 160f\n" + "st1 { v8.b }[8], [x9]\n" + "st1 { v12.b }[8], [x23]\n" + "st1 { v16.b }[8], [x22]\n" + "st1 { v20.b }[8], [x21]\n" + "st1 { v24.b }[8], [x20]\n" + "st1 { v28.b }[8], [x19]\n" + "b 160f\n" + "156:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x10, #2, 158f\n" + "str s8, [x9], #0x4\n" + "str s12, [x23], #0x4\n" + "str s16, [x22], #0x4\n" + "str s20, [x21], #0x4\n" + "str s24, [x20], #0x4\n" + "str s28, [x19], #0x4\n" + "tbz x10, #1, 157f\n" + "st1 { v8.h }[2], [x9], #0x2\n" + "st1 { v12.h }[2], [x23], #0x2\n" + "st1 { v16.h }[2], [x22], #0x2\n" + "st1 { v20.h }[2], [x21], 
#0x2\n" + "st1 { v24.h }[2], [x20], #0x2\n" + "st1 { v28.h }[2], [x19], #0x2\n" + "tbz x10, #0, 160f\n" + "st1 { v8.b }[6], [x9]\n" + "st1 { v12.b }[6], [x23]\n" + "st1 { v16.b }[6], [x22]\n" + "st1 { v20.b }[6], [x21]\n" + "st1 { v24.b }[6], [x20]\n" + "st1 { v28.b }[6], [x19]\n" + "b 160f\n" + "157:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x10, #0, 160f\n" + "st1 { v8.b }[4], [x9]\n" + "st1 { v12.b }[4], [x23]\n" + "st1 { v16.b }[4], [x22]\n" + "st1 { v20.b }[4], [x21]\n" + "st1 { v24.b }[4], [x20]\n" + "st1 { v28.b }[4], [x19]\n" + "b 160f\n" + "158:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x10, #1, 159f\n" + "str h8, [x9], #0x2\n" + "str h12, [x23], #0x2\n" + "str h16, [x22], #0x2\n" + "str h20, [x21], #0x2\n" + "str h24, [x20], #0x2\n" + "str h28, [x19], #0x2\n" + "tbz x10, #0, 160f\n" + "st1 { v8.b }[2], [x9]\n" + "st1 { v12.b }[2], [x23]\n" + "st1 { v16.b }[2], [x22]\n" + "st1 { v20.b }[2], [x21]\n" + "st1 { v24.b }[2], [x20]\n" + "st1 { v28.b }[2], [x19]\n" + "b 160f\n" + "159:" // Height 6: Partial direct writeback: partial_1_0 + "str b8, [x9, #0x0]\n" + "str b12, [x23, #0x0]\n" + "str b16, [x22, #0x0]\n" + "str b20, [x21, #0x0]\n" + "str b24, [x20, #0x0]\n" + "str b28, [x19, #0x0]\n" + "160:" // Height 6: Partial direct writeback: Done + "b 162f\n" + "161:" // Height 6: Full writeback + "str q8, [x9, #0x0]\n" "add x9, x9, #0x10\n" - "add x27, x27, #0x10\n" - "add x25, x25, #0x10\n" - "add x23, x23, #0x10\n" - "add x21, x21, #0x10\n" - "168:" // Height 6: Writeback done - "subs x15, x15, #0x10\n" - "bgt 143b\n" + "str q12, [x23, #0x0]\n" + "str q16, [x22, #0x0]\n" + "str q20, [x21, #0x0]\n" + "str q24, [x20, #0x0]\n" + "str q28, [x19, #0x0]\n" + "162:" // Height 6: Writeback done + "subs x10, x10, #0x10\n" + "bgt 137b\n" "subs %x[M], %x[M], #0x6\n" - "beq 170f\n" + "beq 164f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 169f\n" + "tbz %x[flags], #3, 163f\n" "add x20, x20, #0x6\n" "str 
x20, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "169:" // Update direct input + "163:" // Update direct input "mov x19, #0x6\n" "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" "b 1b\n" - "170:" // Exit + "164:" // Exit : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git 
a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp index 6b3f84064c..759a78a413 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp @@ -37,9 +37,9 @@ namespace arm_gemm { - // Actual kernel implementations void a64_hybrid_s8s32_dot_6x16( ARGLIST ); +void a64_hybrid_s8s32_dot_6x16_a55( ARGLIST ); class cls_a64_hybrid_s8s32_dot_6x16 { @@ -72,10 +72,11 @@ public: StdTransformsFixed transforms = {}; - static PerformanceParameters get_performance_parameters(const CPUInfo *ci) { + static PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { switch (ci->get_cpu_model()) { case CPUModel::A55r1: - return { 9.5238, 2.0799, 0.2279 }; + return { 12.667, 2.0799, 0.2279 }; default: return { 29.6736, 11.4025, 0.5591 }; } @@ -83,9 +84,15 @@ public: // Default to the generic kernel kern_type kernel=a64_hybrid_s8s32_dot_6x16; - - cls_a64_hybrid_s8s32_dot_6x16(const CPUInfo *) + cls_a64_hybrid_s8s32_dot_6x16(const CPUInfo *ci) { + switch(ci->get_cpu_model()) { + default: + break; + case CPUModel::A55r1: + kernel=a64_hybrid_s8s32_dot_6x16_a55; + break; + } } }; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp new file mode 100644 index 0000000000..6b2be0a33a --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp @@ -0,0 +1,3499 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include + +namespace arm_gemm { + +void a64_hybrid_s8s32_dot_6x16_a55 ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg A_arg, + size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg output_arg, + const int32_t *, Activation, bool accumulate +) +{ + struct KernelArgs { + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const int8_t *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + __asm__ __volatile__( + + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 171f\n" + "cmp %x[M], #0x4\n" + "bgt 137f\n" + "beq 103f\n" + "cmp %x[M], #0x2\n" + "bgt 69f\n" + "beq 35f\n" + "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x12, %x[output_ptr]\n" + "2:" // Height 1: Column loop + "tbz %x[flags], #0, 12f\n" + "cmp x14, #0x10\n" + "bge 11f\n" + "tbz x14, #3, 6f\n" + "ld1 { v8.4s }, [x12], #0x10\n" + "ld1 { v9.4s }, [x12], #0x10\n" + "tbz x14, #2, 4f\n" + "ld1 { v10.4s }, [x12], #0x10\n" + "tbz x14, #1, 3f\n" + "mov x24, #0x38\n" + "ldr d11, [x12], #0x8\n" + "tbz 
x14, #0, 10f\n" + "ld1 { v11.s }[2], [x12]\n" + "b 10f\n" + "3:" // Height 1: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x14, #0, 10f\n" + "ldr s11, [x12, #0x0]\n" + "b 10f\n" + "4:" // Height 1: Partial accumulate: partial_2_8 + "tbz x14, #1, 5f\n" + "ldr d10, [x12], #0x8\n" + "mov x24, #0x28\n" + "tbz x14, #0, 10f\n" + "ld1 { v10.s }[2], [x12]\n" + "b 10f\n" + "5:" // Height 1: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x14, #0, 10f\n" + "ldr s10, [x12, #0x0]\n" + "b 10f\n" + "6:" // Height 1: Partial accumulate: partial_4_0 + "tbz x14, #2, 8f\n" + "ld1 { v8.4s }, [x12], #0x10\n" + "tbz x14, #1, 7f\n" + "mov x24, #0x18\n" + "ldr d9, [x12], #0x8\n" + "tbz x14, #0, 10f\n" + "ld1 { v9.s }[2], [x12]\n" + "b 10f\n" + "7:" // Height 1: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x14, #0, 10f\n" + "ldr s9, [x12, #0x0]\n" + "b 10f\n" + "8:" // Height 1: Partial accumulate: partial_2_0 + "tbz x14, #1, 9f\n" + "ldr d8, [x12], #0x8\n" + "mov x24, #0x8\n" + "tbz x14, #0, 10f\n" + "ld1 { v8.s }[2], [x12]\n" + "b 10f\n" + "9:" // Height 1: Partial accumulate: partial_1_0 + "ldr s8, [x12, #0x0]\n" + "mov x24, #0x0\n" + "10:" // Height 1: Partial accumulate: Done + "sub x12, x12, x24\n" + "b 13f\n" + "11:" // Height 1: full accumulate + "ldr q8, [x12, #0x0]\n" + "ldr q9, [x12, #0x10]\n" + "ldr q10, [x12, #0x20]\n" + "ldr q11, [x12, #0x30]\n" + "b 13f\n" + "12:" // Height 1: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "13:" // Height 1: setup done + "mov x11, #0x0\n" + "14:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w10, [x20, x11, LSL #0x2]\n" + "tbz %x[flags], #3, 15f\n" + "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x9, [x20, #0x0]\n" + "cbnz x11, 16f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add 
x9, x9, x19\n" + "b 16f\n" + "15:" // Height 1: setup direct input + "mov x9, %x[input_ptr]\n" + "16:" // Height 1: input setup done + "cmp x10, #0x10\n" + "blt 19f\n" + "ldr q0, [x9, #0x0]\n" + "ldr q6, [x13, #0x0]\n" + "cmp x10, #0x20\n" + "blt 18f\n" + "17:" // Height 1: Multiply loop: Main loop head + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr d7, [x13, #0x10]\n" + "ldr x19, [x13, #0x18]\n" + "add x9, x9, #0x10\n" + "ldr d6, [x13, #0x20]\n" + "sub x10, x10, #0x10\n" + "ldr x28, [x13, #0x28]\n" + "cmp x10, #0x20\n" + "mov v7.d[1], x19\n" + "prfm pldl1keep, [x9, #0x80]\n" + "ldr x19, [x13, #0x38]\n" + "mov v6.d[1], x28\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr d7, [x13, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr d6, [x13, #0x40]\n" + "ldr x28, [x13, #0x48]\n" + "mov v7.d[1], x19\n" + "ldr x19, [x13, #0x58]\n" + "ldr x27, [x9, #0x8]\n" + "mov v6.d[1], x28\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "ldr d7, [x13, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + "ldr d6, [x13, #0x60]\n" + "ldr x28, [x13, #0x68]\n" + "mov v7.d[1], x19\n" + "ldr x19, [x13, #0x78]\n" + "mov v6.d[1], x28\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + "ldr d7, [x13, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + "ldr d6, [x13, #0x80]\n" + "ldr x28, [x13, #0x88]\n" + "mov v7.d[1], x19\n" + "ldr x19, [x13, #0x98]\n" + "mov v6.d[1], x28\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + "ldr d7, [x13, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + "ldr d6, [x13, #0xa0]\n" + "ldr x28, [x13, #0xa8]\n" + "mov v7.d[1], x19\n" + "ldr x19, [x13, #0xb8]\n" + "mov v6.d[1], x28\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + "ldr d7, [x13, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + "ldr d6, [x13, #0xc0]\n" + "ldr x28, [x13, #0xc8]\n" + "mov v7.d[1], x19\n" + "ldr x19, [x13, #0xd8]\n" + "mov 
v6.d[1], x28\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + "ldr d7, [x13, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + "ldr d6, [x13, #0xe0]\n" + "ldr x28, [x13, #0xe8]\n" + "mov v7.d[1], x19\n" + "ldr x19, [x13, #0xf8]\n" + "mov v6.d[1], x28\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + "ldr d7, [x13, #0xf0]\n" + "add x13, x13, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + "ldr d6, [x13, #0x0]\n" + "ldr x28, [x13, #0x8]\n" + "mov v7.d[1], x19\n" + "mov v6.d[1], x28\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + "ldr d0, [x9, #0x0]\n" + "mov v0.d[1], x27\n" + "bge 17b\n" + "18:" // Height 1: Multiply loop: Single iteration only + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x13, #0x10]\n" + "ldr q6, [x13, #0x20]\n" + "sub x10, x10, #0x10\n" + "add x9, x9, #0x10\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "prfm pldl1keep, [x9, #0x80]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x13, #0x30]\n" + "ldr q6, [x13, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x13, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x13, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x13, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x13, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x13, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x13, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x13, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x13, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x13, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + "ldr q6, [x13, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + "ldr q7, [x13, #0xf0]\n" + ".inst 
0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + "add x13, x13, #0x100\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + "19:" // Height 1: Multiply loop: Main loop skip + "cbz x10, 24f\n" + "cmp x10, #0x4\n" + "blt 21f\n" + "20:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x9], #0x4\n" + "sub x10, x10, #0x4\n" + "ldr q6, [x13, #0x0]\n" + "cmp x10, #0x4\n" + "ldr q7, [x13, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q6, [x13, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "bge 20b\n" + "cbz x10, 24f\n" + "21:" // Height 1: Multiply loop: Skip odd blocks + "tbz x10, #1, 22f\n" + "ldr h0, [x9], #0x2\n" + "tbz x10, #0, 23f\n" + "ld1 { v0.b }[2], [x9]\n" + "b 23f\n" + "22:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x9, #0x0]\n" + "23:" // Height 1: Multiply loop: Ragged operand read: Done + "ldr q6, [x13, #0x0]\n" + "ldr q7, [x13, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q6, [x13, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "24:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x11, x11, #0x1\n" + "cmp x11, x19\n" + "bne 14b\n" + "prfm pstl1keep, [x12, #0x0]\n" + "cmp x14, #0x10\n" + "bge 33f\n" + "tbz x14, #3, 28f\n" + "st1 { v8.4s }, [x12], #0x10\n" + "st1 { v9.4s }, [x12], #0x10\n" + "tbz x14, #2, 26f\n" + "st1 { v10.4s }, [x12], #0x10\n" + "tbz x14, #1, 25f\n" + "str d11, [x12], #0x8\n" + "tbz x14, #0, 32f\n" + "st1 { v11.s }[2], [x12]\n" + "b 32f\n" + "25:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x14, #0, 32f\n" 
+ "str s11, [x12, #0x0]\n" + "b 32f\n" + "26:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x14, #1, 27f\n" + "str d10, [x12], #0x8\n" + "tbz x14, #0, 32f\n" + "st1 { v10.s }[2], [x12]\n" + "b 32f\n" + "27:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x14, #0, 32f\n" + "str s10, [x12, #0x0]\n" + "b 32f\n" + "28:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x14, #2, 30f\n" + "st1 { v8.4s }, [x12], #0x10\n" + "tbz x14, #1, 29f\n" + "str d9, [x12], #0x8\n" + "tbz x14, #0, 32f\n" + "st1 { v9.s }[2], [x12]\n" + "b 32f\n" + "29:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x14, #0, 32f\n" + "str s9, [x12, #0x0]\n" + "b 32f\n" + "30:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x14, #1, 31f\n" + "str d8, [x12], #0x8\n" + "tbz x14, #0, 32f\n" + "st1 { v8.s }[2], [x12]\n" + "b 32f\n" + "31:" // Height 1: Partial direct writeback: partial_1_0 + "str s8, [x12, #0x0]\n" + "32:" // Height 1: Partial direct writeback: Done + "b 34f\n" + "33:" // Height 1: Full writeback + "str q8, [x12, #0x0]\n" + "str q9, [x12, #0x10]\n" + "str q10, [x12, #0x20]\n" + "str q11, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + "34:" // Height 1: Writeback done + "subs x14, x14, #0x10\n" + "bgt 2b\n" + "b 206f\n" + "35:" // Height 2 + "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" + "mov x12, %x[output_ptr]\n" + "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "36:" // Height 2: Column loop + "tbz %x[flags], #0, 46f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x14, #0x10\n" + "add x23, x12, x19, LSL #2\n" + "bge 45f\n" + "tbz x14, #3, 40f\n" + "ld1 { v8.4s }, [x12], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v9.4s }, [x12], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "tbz x14, #2, 38f\n" + "ld1 { v10.4s }, [x12], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "tbz x14, #1, 37f\n" + "mov x24, #0x38\n" + "ldr d11, [x12], #0x8\n" + "ldr d15, [x23], #0x8\n" + "tbz x14, #0, 44f\n" + "ld1 { v11.s }[2], [x12]\n" 
+ "ld1 { v15.s }[2], [x23]\n" + "b 44f\n" + "37:" // Height 2: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x14, #0, 44f\n" + "ldr s11, [x12, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "b 44f\n" + "38:" // Height 2: Partial accumulate: partial_2_8 + "tbz x14, #1, 39f\n" + "ldr d10, [x12], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "tbz x14, #0, 44f\n" + "ld1 { v10.s }[2], [x12]\n" + "ld1 { v14.s }[2], [x23]\n" + "b 44f\n" + "39:" // Height 2: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x14, #0, 44f\n" + "ldr s10, [x12, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "b 44f\n" + "40:" // Height 2: Partial accumulate: partial_4_0 + "tbz x14, #2, 42f\n" + "ld1 { v8.4s }, [x12], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "tbz x14, #1, 41f\n" + "mov x24, #0x18\n" + "ldr d9, [x12], #0x8\n" + "ldr d13, [x23], #0x8\n" + "tbz x14, #0, 44f\n" + "ld1 { v9.s }[2], [x12]\n" + "ld1 { v13.s }[2], [x23]\n" + "b 44f\n" + "41:" // Height 2: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x14, #0, 44f\n" + "ldr s9, [x12, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "b 44f\n" + "42:" // Height 2: Partial accumulate: partial_2_0 + "tbz x14, #1, 43f\n" + "ldr d8, [x12], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "tbz x14, #0, 44f\n" + "ld1 { v8.s }[2], [x12]\n" + "ld1 { v12.s }[2], [x23]\n" + "b 44f\n" + "43:" // Height 2: Partial accumulate: partial_1_0 + "ldr s8, [x12, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "44:" // Height 2: Partial accumulate: Done + "sub x12, x12, x24\n" + "b 47f\n" + "45:" // Height 2: full accumulate + "ldr q8, [x12, #0x0]\n" + "ldr q9, [x12, #0x10]\n" + "ldr q10, [x12, #0x20]\n" + "ldr q11, [x12, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + "b 47f\n" + "46:" // Height 2: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi 
v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "47:" // Height 2: setup done + "mov x11, #0x0\n" + "48:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w10, [x20, x11, LSL #0x2]\n" + "tbz %x[flags], #3, 49f\n" + "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x9, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "cbnz x11, 50f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x9, x9, x19\n" + "add x26, x26, x19\n" + "b 50f\n" + "49:" // Height 2: setup direct input + "mov x9, %x[input_ptr]\n" + "add x26, x9, x19\n" + "50:" // Height 2: input setup done + "cmp x10, #0x10\n" + "blt 53f\n" + "ldr q0, [x9, #0x0]\n" + "ldr q1, [x26, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q6, [x13, #0x0]\n" + "blt 52f\n" + "51:" // Height 2: Multiply loop: Main loop head + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "ldr d7, [x13, #0x10]\n" + "add x9, x9, #0x10\n" + "ldr x19, [x13, #0x18]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x9, #0x80]\n" + "sub x10, x10, #0x10\n" + "ldr d6, [x13, #0x20]\n" + "cmp x10, #0x20\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0x28]\n" + "ldr x19, [x13, #0x38]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "mov v6.d[1], x28\n" + "ldr d7, [x13, #0x30]\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x13, #0x40]\n" + "ldr x28, [x13, #0x48]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "ldr d7, [x13, #0x50]\n" + "mov v6.d[1], x28\n" + "ldr x19, [x13, #0x58]\n" + "ldr x28, [x13, #0x68]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 
0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x13, #0x60]\n" + "ldr x19, [x13, #0x78]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + "mov v6.d[1], x28\n" + "ldr d7, [x13, #0x70]\n" + "ldr x28, [x13, #0x88]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x13, #0x80]\n" + "ldr x19, [x13, #0x98]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + "mov v6.d[1], x28\n" + "ldr d7, [x13, #0x90]\n" + "ldr x28, [x13, #0xa8]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x13, #0xa0]\n" + "ldr x19, [x13, #0xb8]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + "mov v6.d[1], x28\n" + "ldr d7, [x13, #0xb0]\n" + "ldr x28, [x13, #0xc8]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x13, #0xc0]\n" + "ldr x19, [x13, #0xd8]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + "mov v6.d[1], x28\n" + "ldr d7, [x13, #0xd0]\n" + "ldr x28, [x13, #0xe8]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x13, #0xe0]\n" + "ldr x19, [x13, #0xf8]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + "mov v6.d[1], x28\n" + "ldr d7, [x13, #0xf0]\n" + "ldr x27, [x9, #0x8]\n" + "add x13, x13, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + "mov v7.d[1], x19\n" + "ldr 
x25, [x26, #0x8]\n" + "ldr d6, [x13, #0x0]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + "ldr d0, [x9, #0x0]\n" + "ldr d1, [x26, #0x0]\n" + "ldr x28, [x13, #0x8]\n" + "mov v0.d[1], x27\n" + "mov v1.d[1], x25\n" + "mov v6.d[1], x28\n" + "bge 51b\n" + "52:" // Height 2: Multiply loop: Single iteration only + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q7, [x13, #0x10]\n" + "sub x10, x10, #0x10\n" + "ldr q6, [x13, #0x20]\n" + "add x9, x9, #0x10\n" + "prfm pldl1keep, [x9, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + "ldr q7, [x13, #0x30]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x13, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x13, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x13, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x13, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x13, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x13, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x13, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x13, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 
0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x13, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x13, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + "ldr q6, [x13, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + "ldr q7, [x13, #0xf0]\n" + "add x13, x13, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + "53:" // Height 2: Multiply loop: Main loop skip + "cbz x10, 58f\n" + "cmp x10, #0x4\n" + "blt 55f\n" + "54:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x9], #0x4\n" + "sub x10, x10, #0x4\n" + "ldr s1, [x26], #0x4\n" + "cmp x10, #0x4\n" + "ldr q6, [x13, #0x0]\n" + "ldr q7, [x13, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "bge 54b\n" + "cbz x10, 58f\n" + "55:" // Height 2: Multiply loop: Skip odd blocks + "tbz x10, #1, 56f\n" + "ldr h0, [x9], #0x2\n" + "ldr h1, [x26], #0x2\n" + "tbz x10, #0, 57f\n" + "ld1 { v0.b }[2], [x9]\n" + "ld1 { v1.b }[2], [x26]\n" + "b 57f\n" + "56:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x9, #0x0]\n" + "ldr b1, [x26, #0x0]\n" + "57:" // Height 2: Multiply loop: Ragged operand read: 
Done + "ldr q6, [x13, #0x0]\n" + "ldr q7, [x13, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "58:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x11, x11, #0x1\n" + "cmp x11, x19\n" + "bne 48b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x12, #0x0]\n" + "cmp x14, #0x10\n" + "add x23, x12, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "bge 67f\n" + "tbz x14, #3, 62f\n" + "st1 { v8.4s }, [x12], #0x10\n" + "st1 { v9.4s }, [x12], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v13.4s }, [x23], #0x10\n" + "tbz x14, #2, 60f\n" + "st1 { v10.4s }, [x12], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "tbz x14, #1, 59f\n" + "str d11, [x12], #0x8\n" + "str d15, [x23], #0x8\n" + "tbz x14, #0, 66f\n" + "st1 { v11.s }[2], [x12]\n" + "st1 { v15.s }[2], [x23]\n" + "b 66f\n" + "59:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x14, #0, 66f\n" + "str s11, [x12, #0x0]\n" + "str s15, [x23, #0x0]\n" + "b 66f\n" + "60:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x14, #1, 61f\n" + "str d10, [x12], #0x8\n" + "str d14, [x23], #0x8\n" + "tbz x14, #0, 66f\n" + "st1 { v10.s }[2], [x12]\n" + "st1 { v14.s }[2], [x23]\n" + "b 66f\n" + "61:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x14, #0, 66f\n" + "str s10, [x12, #0x0]\n" + "str s14, [x23, #0x0]\n" + "b 66f\n" + "62:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x14, #2, 64f\n" + "st1 { v8.4s }, 
[x12], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "tbz x14, #1, 63f\n" + "str d9, [x12], #0x8\n" + "str d13, [x23], #0x8\n" + "tbz x14, #0, 66f\n" + "st1 { v9.s }[2], [x12]\n" + "st1 { v13.s }[2], [x23]\n" + "b 66f\n" + "63:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x14, #0, 66f\n" + "str s9, [x12, #0x0]\n" + "str s13, [x23, #0x0]\n" + "b 66f\n" + "64:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x14, #1, 65f\n" + "str d8, [x12], #0x8\n" + "str d12, [x23], #0x8\n" + "tbz x14, #0, 66f\n" + "st1 { v8.s }[2], [x12]\n" + "st1 { v12.s }[2], [x23]\n" + "b 66f\n" + "65:" // Height 2: Partial direct writeback: partial_1_0 + "str s8, [x12, #0x0]\n" + "str s12, [x23, #0x0]\n" + "66:" // Height 2: Partial direct writeback: Done + "b 68f\n" + "67:" // Height 2: Full writeback + "str q8, [x12, #0x0]\n" + "str q9, [x12, #0x10]\n" + "str q10, [x12, #0x20]\n" + "str q11, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + "str q12, [x23, #0x0]\n" + "str q13, [x23, #0x10]\n" + "str q14, [x23, #0x20]\n" + "str q15, [x23, #0x30]\n" + "68:" // Height 2: Writeback done + "subs x14, x14, #0x10\n" + "bgt 36b\n" + "b 206f\n" + "69:" // Height 3 + "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" + "mov x12, %x[output_ptr]\n" + "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "70:" // Height 3: Column loop + "tbz %x[flags], #0, 80f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x14, #0x10\n" + "add x23, x12, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "bge 79f\n" + "tbz x14, #3, 74f\n" + "ld1 { v8.4s }, [x12], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v9.4s }, [x12], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "tbz x14, #2, 72f\n" + "ld1 { v10.4s }, [x12], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { v18.4s }, [x22], #0x10\n" + "tbz x14, #1, 71f\n" + "ldr d11, [x12], #0x8\n" + "mov x24, #0x38\n" + "ldr d15, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "tbz 
x14, #0, 78f\n" + "ld1 { v11.s }[2], [x12]\n" + "ld1 { v15.s }[2], [x23]\n" + "ld1 { v19.s }[2], [x22]\n" + "b 78f\n" + "71:" // Height 3: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x14, #0, 78f\n" + "ldr s11, [x12, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "b 78f\n" + "72:" // Height 3: Partial accumulate: partial_2_8 + "tbz x14, #1, 73f\n" + "ldr d10, [x12], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "ldr d18, [x22], #0x8\n" + "tbz x14, #0, 78f\n" + "ld1 { v10.s }[2], [x12]\n" + "ld1 { v14.s }[2], [x23]\n" + "ld1 { v18.s }[2], [x22]\n" + "b 78f\n" + "73:" // Height 3: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x14, #0, 78f\n" + "ldr s10, [x12, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "b 78f\n" + "74:" // Height 3: Partial accumulate: partial_4_0 + "tbz x14, #2, 76f\n" + "ld1 { v8.4s }, [x12], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "tbz x14, #1, 75f\n" + "ldr d9, [x12], #0x8\n" + "mov x24, #0x18\n" + "ldr d13, [x23], #0x8\n" + "ldr d17, [x22], #0x8\n" + "tbz x14, #0, 78f\n" + "ld1 { v9.s }[2], [x12]\n" + "ld1 { v13.s }[2], [x23]\n" + "ld1 { v17.s }[2], [x22]\n" + "b 78f\n" + "75:" // Height 3: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x14, #0, 78f\n" + "ldr s9, [x12, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "b 78f\n" + "76:" // Height 3: Partial accumulate: partial_2_0 + "tbz x14, #1, 77f\n" + "ldr d8, [x12], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "ldr d16, [x22], #0x8\n" + "tbz x14, #0, 78f\n" + "ld1 { v8.s }[2], [x12]\n" + "ld1 { v12.s }[2], [x23]\n" + "ld1 { v16.s }[2], [x22]\n" + "b 78f\n" + "77:" // Height 3: Partial accumulate: partial_1_0 + "ldr s8, [x12, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "ldr s16, [x22, #0x0]\n" + "78:" // Height 3: Partial accumulate: Done + "sub x12, x12, x24\n" + "b 81f\n" + "79:" // Height 3: full accumulate + "ldr 
q8, [x12, #0x0]\n" + "ldr q9, [x12, #0x10]\n" + "ldr q10, [x12, #0x20]\n" + "ldr q11, [x12, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + "ldr q16, [x22, #0x0]\n" + "ldr q17, [x22, #0x10]\n" + "ldr q18, [x22, #0x20]\n" + "ldr q19, [x22, #0x30]\n" + "b 81f\n" + "80:" // Height 3: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "81:" // Height 3: setup done + "mov x11, #0x0\n" + "82:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w10, [x20, x11, LSL #0x2]\n" + "tbz %x[flags], #3, 83f\n" + "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x9, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "cbnz x11, 84f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x9, x9, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "b 84f\n" + "83:" // Height 3: setup direct input + "mov x9, %x[input_ptr]\n" + "add x26, x9, x19\n" + "add x24, x26, x19\n" + "84:" // Height 3: input setup done + "cmp x10, #0x10\n" + "blt 87f\n" + "ldr q0, [x9, #0x0]\n" + "ldr q1, [x26, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q2, [x24, #0x0]\n" + "ldr q6, [x13, #0x0]\n" + "blt 86f\n" + "85:" // Height 3: Multiply loop: Main loop head + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "ldr d7, [x13, #0x10]\n" + "ldr x19, [x13, #0x18]\n" + "add x9, x9, #0x10\n" + "ldr d6, [x13, #0x20]\n" + "add x26, x26, #0x10\n" + "ldr x28, [x13, #0x28]\n" + "add x24, x24, #0x10\n" + "mov v7.d[1], x19\n" + 
"prfm pldl1keep, [x9, #0x80]\n" + "ldr x19, [x13, #0x38]\n" + "sub x10, x10, #0x10\n" + "mov v6.d[1], x28\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + "ldr d7, [x13, #0x30]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "cmp x10, #0x20\n" + "ldr d6, [x13, #0x40]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0x48]\n" + "ldr x19, [x13, #0x58]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "mov v6.d[1], x28\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "ldr d7, [x13, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x13, #0x60]\n" + "ldr x28, [x13, #0x68]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + "ldr d7, [x13, #0x70]\n" + "mov v6.d[1], x28\n" + "ldr x19, [x13, #0x78]\n" + "ldr x28, [x13, #0x88]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + "mov v7.d[1], x19\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + "ldr d6, [x13, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + "ldr d7, [x13, #0x90]\n" + "mov v6.d[1], x28\n" + "ldr x19, [x13, #0x98]\n" + "ldr x28, [x13, #0xa8]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, 
v1.4b[2]\n" + "mov v7.d[1], x19\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + "ldr d6, [x13, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + "ldr d7, [x13, #0xb0]\n" + "mov v6.d[1], x28\n" + "ldr x19, [x13, #0xb8]\n" + "ldr x28, [x13, #0xc8]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + "mov v7.d[1], x19\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + "ldr d6, [x13, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + "ldr d7, [x13, #0xd0]\n" + "mov v6.d[1], x28\n" + "ldr x19, [x13, #0xd8]\n" + "ldr x28, [x13, #0xe8]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + "mov v7.d[1], x19\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + "ldr d6, [x13, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + "ldr d7, [x13, #0xf0]\n" + "mov v6.d[1], x28\n" + "ldr x19, [x13, #0xf8]\n" + "ldr x27, [x9, #0x8]\n" + "add x13, x13, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + "mov v7.d[1], x19\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + "ldr x25, [x26, #0x8]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + "ldr d0, [x9, #0x0]\n" + "ldr d1, [x26, #0x0]\n" + "ldr d2, [x24, #0x0]\n" + "mov v0.d[1], x27\n" + "ldr x21, [x24, #0x8]\n" + "mov v1.d[1], x25\n" + "ldr d6, [x13, #0x0]\n" + "ldr x28, [x13, #0x8]\n" + "mov 
v2.d[1], x21\n" + "mov v6.d[1], x28\n" + "bge 85b\n" + "86:" // Height 3: Multiply loop: Single iteration only + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q7, [x13, #0x10]\n" + "ldr q6, [x13, #0x20]\n" + "sub x10, x10, #0x10\n" + "add x9, x9, #0x10\n" + "add x26, x26, #0x10\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "add x24, x24, #0x10\n" + "ldr q7, [x13, #0x30]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x13, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x13, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + "ldr q6, [x13, #0x60]\n" + "ldr q7, [x13, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + "ldr q6, [x13, #0x80]\n" + "ldr q7, [x13, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, 
v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + "ldr q6, [x13, #0xa0]\n" + "ldr q7, [x13, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + "ldr q6, [x13, #0xc0]\n" + "ldr q7, [x13, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + "ldr q6, [x13, #0xe0]\n" + "ldr q7, [x13, #0xf0]\n" + "add x13, x13, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + "87:" // Height 3: Multiply loop: Main loop skip + "cbz x10, 92f\n" + "cmp x10, #0x4\n" + "blt 89f\n" + "88:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x9], #0x4\n" + "sub x10, x10, #0x4\n" + "ldr s1, [x26], #0x4\n" + "cmp x10, #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr q6, [x13, #0x0]\n" + "ldr q7, [x13, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 
0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "bge 88b\n" + "cbz x10, 92f\n" + "89:" // Height 3: Multiply loop: Skip odd blocks + "tbz x10, #1, 90f\n" + "ldr h0, [x9], #0x2\n" + "ldr h1, [x26], #0x2\n" + "ldr h2, [x24], #0x2\n" + "tbz x10, #0, 91f\n" + "ld1 { v0.b }[2], [x9]\n" + "ld1 { v1.b }[2], [x26]\n" + "ld1 { v2.b }[2], [x24]\n" + "b 91f\n" + "90:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x9, #0x0]\n" + "ldr b1, [x26, #0x0]\n" + "ldr b2, [x24, #0x0]\n" + "91:" // Height 3: Multiply loop: Ragged operand read: Done + "ldr q6, [x13, #0x0]\n" + "ldr q7, [x13, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "92:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x11, x11, #0x1\n" + "cmp x11, x19\n" + "bne 
82b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x12, #0x0]\n" + "cmp x14, #0x10\n" + "add x23, x12, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "bge 101f\n" + "tbz x14, #3, 96f\n" + "st1 { v8.4s }, [x12], #0x10\n" + "st1 { v9.4s }, [x12], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v13.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v17.4s }, [x22], #0x10\n" + "tbz x14, #2, 94f\n" + "st1 { v10.4s }, [x12], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "st1 { v18.4s }, [x22], #0x10\n" + "tbz x14, #1, 93f\n" + "str d11, [x12], #0x8\n" + "str d15, [x23], #0x8\n" + "str d19, [x22], #0x8\n" + "tbz x14, #0, 100f\n" + "st1 { v11.s }[2], [x12]\n" + "st1 { v15.s }[2], [x23]\n" + "st1 { v19.s }[2], [x22]\n" + "b 100f\n" + "93:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x14, #0, 100f\n" + "str s11, [x12, #0x0]\n" + "str s15, [x23, #0x0]\n" + "str s19, [x22, #0x0]\n" + "b 100f\n" + "94:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x14, #1, 95f\n" + "str d10, [x12], #0x8\n" + "str d14, [x23], #0x8\n" + "str d18, [x22], #0x8\n" + "tbz x14, #0, 100f\n" + "st1 { v10.s }[2], [x12]\n" + "st1 { v14.s }[2], [x23]\n" + "st1 { v18.s }[2], [x22]\n" + "b 100f\n" + "95:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x14, #0, 100f\n" + "str s10, [x12, #0x0]\n" + "str s14, [x23, #0x0]\n" + "str s18, [x22, #0x0]\n" + "b 100f\n" + "96:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x14, #2, 98f\n" + "st1 { v8.4s }, [x12], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "tbz x14, #1, 97f\n" + "str d9, [x12], #0x8\n" + "str d13, [x23], #0x8\n" + "str d17, [x22], #0x8\n" + "tbz x14, #0, 100f\n" + "st1 { v9.s }[2], [x12]\n" + "st1 { v13.s }[2], [x23]\n" + "st1 { v17.s }[2], [x22]\n" + "b 100f\n" + "97:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x14, #0, 100f\n" 
+ "str s9, [x12, #0x0]\n" + "str s13, [x23, #0x0]\n" + "str s17, [x22, #0x0]\n" + "b 100f\n" + "98:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x14, #1, 99f\n" + "str d8, [x12], #0x8\n" + "str d12, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "tbz x14, #0, 100f\n" + "st1 { v8.s }[2], [x12]\n" + "st1 { v12.s }[2], [x23]\n" + "st1 { v16.s }[2], [x22]\n" + "b 100f\n" + "99:" // Height 3: Partial direct writeback: partial_1_0 + "str s8, [x12, #0x0]\n" + "str s12, [x23, #0x0]\n" + "str s16, [x22, #0x0]\n" + "100:" // Height 3: Partial direct writeback: Done + "b 102f\n" + "101:" // Height 3: Full writeback + "str q8, [x12, #0x0]\n" + "str q9, [x12, #0x10]\n" + "str q10, [x12, #0x20]\n" + "str q11, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + "str q12, [x23, #0x0]\n" + "str q13, [x23, #0x10]\n" + "str q14, [x23, #0x20]\n" + "str q15, [x23, #0x30]\n" + "str q16, [x22, #0x0]\n" + "str q17, [x22, #0x10]\n" + "str q18, [x22, #0x20]\n" + "str q19, [x22, #0x30]\n" + "102:" // Height 3: Writeback done + "subs x14, x14, #0x10\n" + "bgt 70b\n" + "b 206f\n" + "103:" // Height 4 + "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" + "mov x12, %x[output_ptr]\n" + "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "104:" // Height 4: Column loop + "tbz %x[flags], #0, 114f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x14, #0x10\n" + "add x23, x12, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "bge 113f\n" + "tbz x14, #3, 108f\n" + "ld1 { v8.4s }, [x12], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v9.4s }, [x12], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v21.4s }, [x21], #0x10\n" + "tbz x14, #2, 106f\n" + "ld1 { v10.4s }, [x12], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { v18.4s }, [x22], #0x10\n" + "ld1 { v22.4s }, [x21], #0x10\n" + "tbz x14, #1, 105f\n" + "ldr d11, [x12], #0x8\n" + "mov x24, 
#0x38\n" + "ldr d15, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" + "tbz x14, #0, 112f\n" + "ld1 { v11.s }[2], [x12]\n" + "ld1 { v15.s }[2], [x23]\n" + "ld1 { v19.s }[2], [x22]\n" + "ld1 { v23.s }[2], [x21]\n" + "b 112f\n" + "105:" // Height 4: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x14, #0, 112f\n" + "ldr s11, [x12, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "ldr s23, [x21, #0x0]\n" + "b 112f\n" + "106:" // Height 4: Partial accumulate: partial_2_8 + "tbz x14, #1, 107f\n" + "ldr d10, [x12], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "ldr d18, [x22], #0x8\n" + "ldr d22, [x21], #0x8\n" + "tbz x14, #0, 112f\n" + "ld1 { v10.s }[2], [x12]\n" + "ld1 { v14.s }[2], [x23]\n" + "ld1 { v18.s }[2], [x22]\n" + "ld1 { v22.s }[2], [x21]\n" + "b 112f\n" + "107:" // Height 4: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x14, #0, 112f\n" + "ldr s10, [x12, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "ldr s22, [x21, #0x0]\n" + "b 112f\n" + "108:" // Height 4: Partial accumulate: partial_4_0 + "tbz x14, #2, 110f\n" + "ld1 { v8.4s }, [x12], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "tbz x14, #1, 109f\n" + "ldr d9, [x12], #0x8\n" + "mov x24, #0x18\n" + "ldr d13, [x23], #0x8\n" + "ldr d17, [x22], #0x8\n" + "ldr d21, [x21], #0x8\n" + "tbz x14, #0, 112f\n" + "ld1 { v9.s }[2], [x12]\n" + "ld1 { v13.s }[2], [x23]\n" + "ld1 { v17.s }[2], [x22]\n" + "ld1 { v21.s }[2], [x21]\n" + "b 112f\n" + "109:" // Height 4: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x14, #0, 112f\n" + "ldr s9, [x12, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "ldr s21, [x21, #0x0]\n" + "b 112f\n" + "110:" // Height 4: Partial accumulate: partial_2_0 + "tbz x14, #1, 111f\n" + "ldr d8, [x12], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "ldr d16, [x22], #0x8\n" + "ldr d20, [x21], 
#0x8\n" + "tbz x14, #0, 112f\n" + "ld1 { v8.s }[2], [x12]\n" + "ld1 { v12.s }[2], [x23]\n" + "ld1 { v16.s }[2], [x22]\n" + "ld1 { v20.s }[2], [x21]\n" + "b 112f\n" + "111:" // Height 4: Partial accumulate: partial_1_0 + "ldr s8, [x12, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "ldr s16, [x22, #0x0]\n" + "ldr s20, [x21, #0x0]\n" + "112:" // Height 4: Partial accumulate: Done + "sub x12, x12, x24\n" + "b 115f\n" + "113:" // Height 4: full accumulate + "ldr q8, [x12, #0x0]\n" + "ldr q9, [x12, #0x10]\n" + "ldr q10, [x12, #0x20]\n" + "ldr q11, [x12, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + "ldr q16, [x22, #0x0]\n" + "ldr q17, [x22, #0x10]\n" + "ldr q18, [x22, #0x20]\n" + "ldr q19, [x22, #0x30]\n" + "ldr q20, [x21, #0x0]\n" + "ldr q21, [x21, #0x10]\n" + "ldr q22, [x21, #0x20]\n" + "ldr q23, [x21, #0x30]\n" + "b 115f\n" + "114:" // Height 4: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "115:" // Height 4: setup done + "mov x11, #0x0\n" + "116:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w10, [x20, x11, LSL #0x2]\n" + "tbz %x[flags], #3, 117f\n" + "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x9, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "cbnz x11, 118f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x9, x9, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "b 118f\n" + "117:" // 
Height 4: setup direct input + "mov x9, %x[input_ptr]\n" + "add x26, x9, x19\n" + "add x24, x26, x19\n" + "add x23, x24, x19\n" + "118:" // Height 4: input setup done + "cmp x10, #0x10\n" + "blt 121f\n" + "ldr q0, [x9, #0x0]\n" + "ldr q1, [x26, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q2, [x24, #0x0]\n" + "ldr q3, [x23, #0x0]\n" + "ldr q6, [x13, #0x0]\n" + "blt 120f\n" + "119:" // Height 4: Multiply loop: Main loop head + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "ldr d7, [x13, #0x10]\n" + "add x9, x9, #0x10\n" + "ldr x19, [x13, #0x18]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x9, #0x80]\n" + "add x24, x24, #0x10\n" + "ldr d6, [x13, #0x20]\n" + "add x23, x23, #0x10\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0x28]\n" + "ldr x19, [x13, #0x38]\n" + "sub x10, x10, #0x10\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "mov v6.d[1], x28\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "ldr d7, [x13, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + "mov v7.d[1], x19\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr d6, [x13, #0x40]\n" + "cmp x10, #0x20\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "ldr x28, [x13, #0x48]\n" + "ldr d7, [x13, #0x50]\n" + "ldr x19, [x13, #0x58]\n" + "mov v6.d[1], x28\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr x28, [x13, #0x68]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, 
v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x13, #0x60]\n" + "ldr x19, [x13, #0x78]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + "mov v6.d[1], x28\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + "ldr d7, [x13, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + "mov v7.d[1], x19\n" + "prfm pldl1keep, [x23, #0x80]\n" + "ldr d6, [x13, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + "ldr x28, [x13, #0x88]\n" + "ldr d7, [x13, #0x90]\n" + "ldr x19, [x13, #0x98]\n" + "mov v6.d[1], x28\n" + "ldr x28, [x13, #0xa8]\n" + "ldr x27, [x9, #0x8]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x13, #0xa0]\n" + "ldr x19, [x13, #0xb8]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + "mov v6.d[1], x28\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + "ldr d7, [x13, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // 
sdot v22.4s, v6.16b, v3.4b[2]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x13, #0xc0]\n" + "ldr x28, [x13, #0xc8]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + "mov v6.d[1], x28\n" + "ldr d7, [x13, #0xd0]\n" + "ldr x19, [x13, #0xd8]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x13, #0xe0]\n" + "ldr x28, [x13, #0xe8]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + "mov v6.d[1], x28\n" + "ldr d7, [x13, #0xf0]\n" + "ldr x19, [x13, #0xf8]\n" + "add x13, x13, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x13, #0x0]\n" + "ldr x28, [x13, #0x8]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + "mov v6.d[1], x28\n" + "ldr d0, [x9, #0x0]\n" + "ldr d1, [x26, #0x0]\n" + "ldr x25, [x26, #0x8]\n" + "mov v0.d[1], x27\n" + "ldr d2, [x24, #0x0]\n" + "ldr x21, [x24, #0x8]\n" + "mov v1.d[1], x25\n" + "ldr d3, [x23, #0x0]\n" + "ldr x19, [x23, #0x8]\n" + "mov v2.d[1], x21\n" + "mov v3.d[1], x19\n" + "bge 119b\n" + "120:" // Height 4: Multiply loop: Single iteration only + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + 
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "ldr q7, [x13, #0x10]\n" + "sub x10, x10, #0x10\n" + "ldr q6, [x13, #0x20]\n" + "add x9, x9, #0x10\n" + "prfm pldl1keep, [x9, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + "ldr q7, [x13, #0x30]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x23, x23, #0x10\n" + "ldr q6, [x13, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + "ldr q7, [x13, #0x50]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x13, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + "ldr q7, [x13, #0x70]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "ldr q6, [x13, #0x80]\n" + ".inst 
0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + "ldr q7, [x13, #0x90]\n" + "ldr q6, [x13, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + "ldr q7, [x13, #0xb0]\n" + "ldr q6, [x13, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + "ldr q7, [x13, #0xd0]\n" + "ldr q6, [x13, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + "ldr q7, [x13, #0xf0]\n" + "add x13, x13, #0x100\n" + 
".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + "121:" // Height 4: Multiply loop: Main loop skip + "cbz x10, 126f\n" + "cmp x10, #0x4\n" + "blt 123f\n" + "122:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x9], #0x4\n" + "sub x10, x10, #0x4\n" + "ldr s1, [x26], #0x4\n" + "cmp x10, #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr s3, [x23], #0x4\n" + "ldr q6, [x13, #0x0]\n" + "ldr q7, [x13, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "bge 122b\n" + "cbz x10, 126f\n" + "123:" // Height 4: Multiply loop: Skip odd blocks + "tbz x10, #1, 124f\n" + "ldr h0, [x9], #0x2\n" + "ldr h1, [x26], #0x2\n" + "ldr h2, [x24], #0x2\n" + "ldr h3, [x23], #0x2\n" + "tbz x10, #0, 125f\n" + "ld1 { v0.b }[2], [x9]\n" + "ld1 { v1.b }[2], [x26]\n" + "ld1 { v2.b }[2], [x24]\n" + "ld1 { v3.b }[2], [x23]\n" + "b 125f\n" + "124:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr 
b0, [x9, #0x0]\n" + "ldr b1, [x26, #0x0]\n" + "ldr b2, [x24, #0x0]\n" + "ldr b3, [x23, #0x0]\n" + "125:" // Height 4: Multiply loop: Ragged operand read: Done + "ldr q6, [x13, #0x0]\n" + "ldr q7, [x13, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "126:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x11, x11, #0x1\n" + "cmp x11, x19\n" + "bne 116b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x12, #0x0]\n" + "cmp x14, #0x10\n" + "add x23, x12, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" + "prfm pstl1keep, [x21, #0x0]\n" + "bge 135f\n" + "tbz x14, #3, 130f\n" + "st1 { v8.4s }, [x12], #0x10\n" + "st1 { v9.4s }, [x12], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v13.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v17.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "st1 { v21.4s }, [x21], #0x10\n" + "tbz x14, #2, 
128f\n" + "st1 { v10.4s }, [x12], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "st1 { v18.4s }, [x22], #0x10\n" + "st1 { v22.4s }, [x21], #0x10\n" + "tbz x14, #1, 127f\n" + "str d11, [x12], #0x8\n" + "str d15, [x23], #0x8\n" + "str d19, [x22], #0x8\n" + "str d23, [x21], #0x8\n" + "tbz x14, #0, 134f\n" + "st1 { v11.s }[2], [x12]\n" + "st1 { v15.s }[2], [x23]\n" + "st1 { v19.s }[2], [x22]\n" + "st1 { v23.s }[2], [x21]\n" + "b 134f\n" + "127:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x14, #0, 134f\n" + "str s11, [x12, #0x0]\n" + "str s15, [x23, #0x0]\n" + "str s19, [x22, #0x0]\n" + "str s23, [x21, #0x0]\n" + "b 134f\n" + "128:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x14, #1, 129f\n" + "str d10, [x12], #0x8\n" + "str d14, [x23], #0x8\n" + "str d18, [x22], #0x8\n" + "str d22, [x21], #0x8\n" + "tbz x14, #0, 134f\n" + "st1 { v10.s }[2], [x12]\n" + "st1 { v14.s }[2], [x23]\n" + "st1 { v18.s }[2], [x22]\n" + "st1 { v22.s }[2], [x21]\n" + "b 134f\n" + "129:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x14, #0, 134f\n" + "str s10, [x12, #0x0]\n" + "str s14, [x23, #0x0]\n" + "str s18, [x22, #0x0]\n" + "str s22, [x21, #0x0]\n" + "b 134f\n" + "130:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x14, #2, 132f\n" + "st1 { v8.4s }, [x12], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "tbz x14, #1, 131f\n" + "str d9, [x12], #0x8\n" + "str d13, [x23], #0x8\n" + "str d17, [x22], #0x8\n" + "str d21, [x21], #0x8\n" + "tbz x14, #0, 134f\n" + "st1 { v9.s }[2], [x12]\n" + "st1 { v13.s }[2], [x23]\n" + "st1 { v17.s }[2], [x22]\n" + "st1 { v21.s }[2], [x21]\n" + "b 134f\n" + "131:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x14, #0, 134f\n" + "str s9, [x12, #0x0]\n" + "str s13, [x23, #0x0]\n" + "str s17, [x22, #0x0]\n" + "str s21, [x21, #0x0]\n" + "b 134f\n" + "132:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x14, #1, 133f\n" + 
"str d8, [x12], #0x8\n" + "str d12, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "str d20, [x21], #0x8\n" + "tbz x14, #0, 134f\n" + "st1 { v8.s }[2], [x12]\n" + "st1 { v12.s }[2], [x23]\n" + "st1 { v16.s }[2], [x22]\n" + "st1 { v20.s }[2], [x21]\n" + "b 134f\n" + "133:" // Height 4: Partial direct writeback: partial_1_0 + "str s8, [x12, #0x0]\n" + "str s12, [x23, #0x0]\n" + "str s16, [x22, #0x0]\n" + "str s20, [x21, #0x0]\n" + "134:" // Height 4: Partial direct writeback: Done + "b 136f\n" + "135:" // Height 4: Full writeback + "str q8, [x12, #0x0]\n" + "str q9, [x12, #0x10]\n" + "str q10, [x12, #0x20]\n" + "str q11, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + "str q12, [x23, #0x0]\n" + "str q13, [x23, #0x10]\n" + "str q14, [x23, #0x20]\n" + "str q15, [x23, #0x30]\n" + "str q16, [x22, #0x0]\n" + "str q17, [x22, #0x10]\n" + "str q18, [x22, #0x20]\n" + "str q19, [x22, #0x30]\n" + "str q20, [x21, #0x0]\n" + "str q21, [x21, #0x10]\n" + "str q22, [x21, #0x20]\n" + "str q23, [x21, #0x30]\n" + "136:" // Height 4: Writeback done + "subs x14, x14, #0x10\n" + "bgt 104b\n" + "b 206f\n" + "137:" // Height 5 + "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" + "mov x12, %x[output_ptr]\n" + "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "138:" // Height 5: Column loop + "tbz %x[flags], #0, 148f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x14, #0x10\n" + "add x23, x12, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "bge 147f\n" + "tbz x14, #3, 142f\n" + "ld1 { v8.4s }, [x12], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v9.4s }, [x12], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v24.4s }, [x20], #0x10\n" + "ld1 { v21.4s }, [x21], #0x10\n" + "ld1 { v25.4s }, [x20], #0x10\n" + "tbz x14, #2, 140f\n" + "ld1 { v10.4s }, [x12], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { 
v18.4s }, [x22], #0x10\n" + "ld1 { v22.4s }, [x21], #0x10\n" + "ld1 { v26.4s }, [x20], #0x10\n" + "tbz x14, #1, 139f\n" + "ldr d11, [x12], #0x8\n" + "mov x24, #0x38\n" + "ldr d15, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" + "ldr d27, [x20], #0x8\n" + "tbz x14, #0, 146f\n" + "ld1 { v11.s }[2], [x12]\n" + "ld1 { v15.s }[2], [x23]\n" + "ld1 { v19.s }[2], [x22]\n" + "ld1 { v23.s }[2], [x21]\n" + "ld1 { v27.s }[2], [x20]\n" + "b 146f\n" + "139:" // Height 5: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x14, #0, 146f\n" + "ldr s11, [x12, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "ldr s23, [x21, #0x0]\n" + "ldr s27, [x20, #0x0]\n" + "b 146f\n" + "140:" // Height 5: Partial accumulate: partial_2_8 + "tbz x14, #1, 141f\n" + "ldr d10, [x12], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "ldr d18, [x22], #0x8\n" + "ldr d22, [x21], #0x8\n" + "ldr d26, [x20], #0x8\n" + "tbz x14, #0, 146f\n" + "ld1 { v10.s }[2], [x12]\n" + "ld1 { v14.s }[2], [x23]\n" + "ld1 { v18.s }[2], [x22]\n" + "ld1 { v22.s }[2], [x21]\n" + "ld1 { v26.s }[2], [x20]\n" + "b 146f\n" + "141:" // Height 5: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x14, #0, 146f\n" + "ldr s10, [x12, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "ldr s22, [x21, #0x0]\n" + "ldr s26, [x20, #0x0]\n" + "b 146f\n" + "142:" // Height 5: Partial accumulate: partial_4_0 + "tbz x14, #2, 144f\n" + "ld1 { v8.4s }, [x12], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v24.4s }, [x20], #0x10\n" + "tbz x14, #1, 143f\n" + "ldr d9, [x12], #0x8\n" + "mov x24, #0x18\n" + "ldr d13, [x23], #0x8\n" + "ldr d17, [x22], #0x8\n" + "ldr d21, [x21], #0x8\n" + "ldr d25, [x20], #0x8\n" + "tbz x14, #0, 146f\n" + "ld1 { v9.s }[2], [x12]\n" + "ld1 { v13.s }[2], [x23]\n" + "ld1 { v17.s }[2], [x22]\n" + "ld1 { v21.s }[2], [x21]\n" + "ld1 { v25.s }[2], [x20]\n" + "b 146f\n" + 
"143:" // Height 5: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x14, #0, 146f\n" + "ldr s9, [x12, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "ldr s21, [x21, #0x0]\n" + "ldr s25, [x20, #0x0]\n" + "b 146f\n" + "144:" // Height 5: Partial accumulate: partial_2_0 + "tbz x14, #1, 145f\n" + "ldr d8, [x12], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "ldr d16, [x22], #0x8\n" + "ldr d20, [x21], #0x8\n" + "ldr d24, [x20], #0x8\n" + "tbz x14, #0, 146f\n" + "ld1 { v8.s }[2], [x12]\n" + "ld1 { v12.s }[2], [x23]\n" + "ld1 { v16.s }[2], [x22]\n" + "ld1 { v20.s }[2], [x21]\n" + "ld1 { v24.s }[2], [x20]\n" + "b 146f\n" + "145:" // Height 5: Partial accumulate: partial_1_0 + "ldr s8, [x12, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "ldr s16, [x22, #0x0]\n" + "ldr s20, [x21, #0x0]\n" + "ldr s24, [x20, #0x0]\n" + "146:" // Height 5: Partial accumulate: Done + "sub x12, x12, x24\n" + "b 149f\n" + "147:" // Height 5: full accumulate + "ldr q8, [x12, #0x0]\n" + "ldr q9, [x12, #0x10]\n" + "ldr q10, [x12, #0x20]\n" + "ldr q11, [x12, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + "ldr q16, [x22, #0x0]\n" + "ldr q17, [x22, #0x10]\n" + "ldr q18, [x22, #0x20]\n" + "ldr q19, [x22, #0x30]\n" + "ldr q20, [x21, #0x0]\n" + "ldr q21, [x21, #0x10]\n" + "ldr q22, [x21, #0x20]\n" + "ldr q23, [x21, #0x30]\n" + "ldr q24, [x20, #0x0]\n" + "ldr q25, [x20, #0x10]\n" + "ldr q26, [x20, #0x20]\n" + "ldr q27, [x20, #0x30]\n" + "b 149f\n" + "148:" // Height 5: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" 
+ "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "149:" // Height 5: setup done + "mov x11, #0x0\n" + "150:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w10, [x20, x11, LSL #0x2]\n" + "tbz %x[flags], #3, 151f\n" + "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x9, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x11, 152f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x9, x9, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "b 152f\n" + "151:" // Height 5: setup direct input + "mov x9, %x[input_ptr]\n" + "add x26, x9, x19\n" + "add x24, x26, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "152:" // Height 5: input setup done + "cmp x10, #0x10\n" + "blt 155f\n" + "ldr q0, [x9, #0x0]\n" + "ldr q1, [x26, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q2, [x24, #0x0]\n" + "ldr q3, [x23, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x13, #0x0]\n" + "blt 154f\n" + "153:" // Height 5: Multiply loop: Main loop head + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "ldr d7, [x13, #0x10]\n" + "ldr x19, [x13, #0x18]\n" + "add x9, x9, #0x10\n" + "ldr d6, [x13, #0x20]\n" + "add x26, x26, #0x10\n" + "ldr x28, [x13, #0x28]\n" + "add x24, x24, #0x10\n" + "mov v7.d[1], x19\n" + "prfm pldl1keep, [x9, #0x80]\n" + "ldr x19, [x13, #0x38]\n" + "add x23, x23, #0x10\n" + "mov v6.d[1], x28\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, 
v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + "ldr d7, [x13, #0x30]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x22, x22, #0x10\n" + "ldr d6, [x13, #0x40]\n" + "sub x10, x10, #0x10\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0x48]\n" + "ldr x19, [x13, #0x58]\n" + "cmp x10, #0x20\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "mov v6.d[1], x28\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" + "ldr d7, [x13, #0x50]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr d6, [x13, #0x60]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0x68]\n" + "ldr x19, [x13, #0x78]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + "mov v6.d[1], x28\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" + "ldr d7, 
[x13, #0x70]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "ldr d6, [x13, #0x80]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0x88]\n" + "ldr x19, [x13, #0x98]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + "mov v6.d[1], x28\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" + "ldr d7, [x13, #0x90]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr d6, [x13, #0xa0]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0xa8]\n" + "ldr x19, [x13, #0xb8]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + "mov v6.d[1], x28\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" + "ldr d7, [x13, #0xb0]\n" + "ldr d6, [x13, #0xc0]\n" + "ldr x28, [x13, #0xc8]\n" + "mov v7.d[1], x19\n" + "ldr x19, [x13, #0xd8]\n" + "ldr x27, [x9, #0x8]\n" + "mov v6.d[1], x28\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, 
v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" + "ldr d7, [x13, #0xd0]\n" + "ldr d6, [x13, #0xe0]\n" + "ldr x28, [x13, #0xe8]\n" + "mov v7.d[1], x19\n" + "ldr x19, [x13, #0xf8]\n" + "ldr x25, [x26, #0x8]\n" + "mov v6.d[1], x28\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + "ldr d7, [x13, #0xf0]\n" + "ldr x21, [x24, #0x8]\n" + "add x13, x13, #0x100\n" + "ldr d6, [x13, #0x0]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0x8]\n" + "ldr x19, [x23, #0x8]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + "mov v6.d[1], x28\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + "ldr d0, [x9, #0x0]\n" + "ldr d1, [x26, #0x0]\n" + "ldr d2, [x24, #0x0]\n" + "mov v0.d[1], x27\n" + "ldr d3, [x23, #0x0]\n" + "mov v1.d[1], x25\n" + "ldr d4, [x22, #0x0]\n" + "mov v2.d[1], x21\n" + "ldr x21, [x22, #0x8]\n" + "mov v3.d[1], x19\n" + "mov v4.d[1], x21\n" + "bge 153b\n" + "154:" // Height 5: Multiply loop: Single iteration only + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // 
sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "ldr q7, [x13, #0x10]\n" + "ldr q6, [x13, #0x20]\n" + "sub x10, x10, #0x10\n" + "add x9, x9, #0x10\n" + "add x26, x26, #0x10\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "add x24, x24, #0x10\n" + "ldr q7, [x13, #0x30]\n" + "add x23, x23, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x22, x22, #0x10\n" + "ldr q6, [x13, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" + "ldr q7, [x13, #0x50]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x13, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, 
v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" + "ldr q7, [x13, #0x70]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "ldr q6, [x13, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" + "ldr q7, [x13, #0x90]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr q6, [x13, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" + "ldr q7, [x13, #0xb0]\n" + "ldr q6, [x13, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 
0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" + "ldr q7, [x13, #0xd0]\n" + "ldr q6, [x13, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + "ldr q7, [x13, #0xf0]\n" + "add x13, x13, #0x100\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + "155:" // Height 5: Multiply loop: Main loop skip + "cbz x10, 160f\n" + "cmp x10, #0x4\n" + "blt 157f\n" + "156:" // Height 5: Multiply loop: Odd block loop + "ldr s0, [x9], #0x4\n" + "sub x10, x10, #0x4\n" + "ldr s1, [x26], #0x4\n" + "cmp x10, #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr s3, [x23], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr q6, [x13, #0x0]\n" + "ldr q7, [x13, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + 
"ldr q6, [x13, #0x20]\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + "bge 156b\n" + "cbz x10, 160f\n" + "157:" // Height 5: Multiply loop: Skip odd blocks + "tbz x10, #1, 158f\n" + "ldr h0, [x9], #0x2\n" + "ldr h1, [x26], #0x2\n" + "ldr h2, [x24], #0x2\n" + "ldr h3, [x23], #0x2\n" + "ldr h4, [x22], #0x2\n" + "tbz x10, #0, 159f\n" + "ld1 { v0.b }[2], [x9]\n" + "ld1 { v1.b }[2], [x26]\n" + "ld1 { v2.b }[2], [x24]\n" + "ld1 { v3.b }[2], [x23]\n" + "ld1 { v4.b }[2], [x22]\n" + "b 159f\n" + "158:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x9, #0x0]\n" + "ldr b1, [x26, #0x0]\n" + "ldr b2, [x24, #0x0]\n" + "ldr b3, [x23, #0x0]\n" + "ldr b4, [x22, #0x0]\n" + "159:" // Height 5: Multiply loop: Ragged operand read: Done + "ldr q6, [x13, #0x0]\n" + "ldr q7, [x13, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q7, [x13, #0x30]\n" + "add x13, 
x13, #0x40\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + "160:" // Height 5: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x11, x11, #0x1\n" + "cmp x11, x19\n" + "bne 150b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x12, #0x0]\n" + "cmp x14, #0x10\n" + "add x23, x12, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" + "prfm pstl1keep, [x21, #0x0]\n" + "add x20, x21, x19, LSL #2\n" + "prfm pstl1keep, [x20, #0x0]\n" + "bge 169f\n" + "tbz x14, #3, 164f\n" + "st1 { v8.4s }, [x12], #0x10\n" + "st1 { v9.4s }, [x12], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v13.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v17.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "st1 { v21.4s }, [x21], #0x10\n" + "st1 { v24.4s }, [x20], #0x10\n" + "st1 { v25.4s }, [x20], #0x10\n" + "tbz x14, #2, 162f\n" + "st1 { v10.4s }, [x12], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "st1 { v18.4s }, [x22], #0x10\n" + "st1 { v22.4s }, [x21], #0x10\n" + "st1 { v26.4s }, [x20], #0x10\n" + "tbz x14, #1, 161f\n" + "str d11, [x12], #0x8\n" + "str d15, [x23], #0x8\n" + "str d19, [x22], #0x8\n" + "str d23, [x21], #0x8\n" + "str d27, [x20], #0x8\n" + "tbz x14, #0, 168f\n" + "st1 { v11.s }[2], [x12]\n" + "st1 { v15.s }[2], [x23]\n" + "st1 { v19.s }[2], [x22]\n" + "st1 { v23.s }[2], 
[x21]\n" + "st1 { v27.s }[2], [x20]\n" + "b 168f\n" + "161:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x14, #0, 168f\n" + "str s11, [x12, #0x0]\n" + "str s15, [x23, #0x0]\n" + "str s19, [x22, #0x0]\n" + "str s23, [x21, #0x0]\n" + "str s27, [x20, #0x0]\n" + "b 168f\n" + "162:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x14, #1, 163f\n" + "str d10, [x12], #0x8\n" + "str d14, [x23], #0x8\n" + "str d18, [x22], #0x8\n" + "str d22, [x21], #0x8\n" + "str d26, [x20], #0x8\n" + "tbz x14, #0, 168f\n" + "st1 { v10.s }[2], [x12]\n" + "st1 { v14.s }[2], [x23]\n" + "st1 { v18.s }[2], [x22]\n" + "st1 { v22.s }[2], [x21]\n" + "st1 { v26.s }[2], [x20]\n" + "b 168f\n" + "163:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x14, #0, 168f\n" + "str s10, [x12, #0x0]\n" + "str s14, [x23, #0x0]\n" + "str s18, [x22, #0x0]\n" + "str s22, [x21, #0x0]\n" + "str s26, [x20, #0x0]\n" + "b 168f\n" + "164:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x14, #2, 166f\n" + "st1 { v8.4s }, [x12], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "st1 { v24.4s }, [x20], #0x10\n" + "tbz x14, #1, 165f\n" + "str d9, [x12], #0x8\n" + "str d13, [x23], #0x8\n" + "str d17, [x22], #0x8\n" + "str d21, [x21], #0x8\n" + "str d25, [x20], #0x8\n" + "tbz x14, #0, 168f\n" + "st1 { v9.s }[2], [x12]\n" + "st1 { v13.s }[2], [x23]\n" + "st1 { v17.s }[2], [x22]\n" + "st1 { v21.s }[2], [x21]\n" + "st1 { v25.s }[2], [x20]\n" + "b 168f\n" + "165:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x14, #0, 168f\n" + "str s9, [x12, #0x0]\n" + "str s13, [x23, #0x0]\n" + "str s17, [x22, #0x0]\n" + "str s21, [x21, #0x0]\n" + "str s25, [x20, #0x0]\n" + "b 168f\n" + "166:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x14, #1, 167f\n" + "str d8, [x12], #0x8\n" + "str d12, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "str d20, [x21], #0x8\n" + "str d24, [x20], #0x8\n" + "tbz x14, #0, 168f\n" + 
"st1 { v8.s }[2], [x12]\n" + "st1 { v12.s }[2], [x23]\n" + "st1 { v16.s }[2], [x22]\n" + "st1 { v20.s }[2], [x21]\n" + "st1 { v24.s }[2], [x20]\n" + "b 168f\n" + "167:" // Height 5: Partial direct writeback: partial_1_0 + "str s8, [x12, #0x0]\n" + "str s12, [x23, #0x0]\n" + "str s16, [x22, #0x0]\n" + "str s20, [x21, #0x0]\n" + "str s24, [x20, #0x0]\n" + "168:" // Height 5: Partial direct writeback: Done + "b 170f\n" + "169:" // Height 5: Full writeback + "str q8, [x12, #0x0]\n" + "str q9, [x12, #0x10]\n" + "str q10, [x12, #0x20]\n" + "str q11, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + "str q12, [x23, #0x0]\n" + "str q13, [x23, #0x10]\n" + "str q14, [x23, #0x20]\n" + "str q15, [x23, #0x30]\n" + "str q16, [x22, #0x0]\n" + "str q17, [x22, #0x10]\n" + "str q18, [x22, #0x20]\n" + "str q19, [x22, #0x30]\n" + "str q20, [x21, #0x0]\n" + "str q21, [x21, #0x10]\n" + "str q22, [x21, #0x20]\n" + "str q23, [x21, #0x30]\n" + "str q24, [x20, #0x0]\n" + "str q25, [x20, #0x10]\n" + "str q26, [x20, #0x20]\n" + "str q27, [x20, #0x30]\n" + "170:" // Height 5: Writeback done + "subs x14, x14, #0x10\n" + "bgt 138b\n" + "b 206f\n" + "171:" // Height 6 + "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" + "mov x12, %x[output_ptr]\n" + "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x20, #0x18\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "madd %x[output_ptr], x19, x20, %x[output_ptr]\n" + "172:" // Height 6: Column loop + "tbz %x[flags], #0, 182f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x14, #0x10\n" + "add x23, x12, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "add x19, x20, x19, LSL #2\n" + "bge 181f\n" + "tbz x14, #3, 176f\n" + "ld1 { v8.4s }, [x12], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v9.4s }, [x12], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { 
v24.4s }, [x20], #0x10\n" + "ld1 { v28.4s }, [x19], #0x10\n" + "ld1 { v21.4s }, [x21], #0x10\n" + "ld1 { v25.4s }, [x20], #0x10\n" + "ld1 { v29.4s }, [x19], #0x10\n" + "tbz x14, #2, 174f\n" + "ld1 { v10.4s }, [x12], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { v18.4s }, [x22], #0x10\n" + "ld1 { v22.4s }, [x21], #0x10\n" + "ld1 { v26.4s }, [x20], #0x10\n" + "ld1 { v30.4s }, [x19], #0x10\n" + "tbz x14, #1, 173f\n" + "ldr d11, [x12], #0x8\n" + "mov x24, #0x38\n" + "ldr d15, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" + "ldr d27, [x20], #0x8\n" + "ldr d31, [x19], #0x8\n" + "tbz x14, #0, 180f\n" + "ld1 { v11.s }[2], [x12]\n" + "ld1 { v15.s }[2], [x23]\n" + "ld1 { v19.s }[2], [x22]\n" + "ld1 { v23.s }[2], [x21]\n" + "ld1 { v27.s }[2], [x20]\n" + "ld1 { v31.s }[2], [x19]\n" + "b 180f\n" + "173:" // Height 6: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x14, #0, 180f\n" + "ldr s11, [x12, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "ldr s23, [x21, #0x0]\n" + "ldr s27, [x20, #0x0]\n" + "ldr s31, [x19, #0x0]\n" + "b 180f\n" + "174:" // Height 6: Partial accumulate: partial_2_8 + "tbz x14, #1, 175f\n" + "ldr d10, [x12], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "ldr d18, [x22], #0x8\n" + "ldr d22, [x21], #0x8\n" + "ldr d26, [x20], #0x8\n" + "ldr d30, [x19], #0x8\n" + "tbz x14, #0, 180f\n" + "ld1 { v10.s }[2], [x12]\n" + "ld1 { v14.s }[2], [x23]\n" + "ld1 { v18.s }[2], [x22]\n" + "ld1 { v22.s }[2], [x21]\n" + "ld1 { v26.s }[2], [x20]\n" + "ld1 { v30.s }[2], [x19]\n" + "b 180f\n" + "175:" // Height 6: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x14, #0, 180f\n" + "ldr s10, [x12, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "ldr s22, [x21, #0x0]\n" + "ldr s26, [x20, #0x0]\n" + "ldr s30, [x19, #0x0]\n" + "b 180f\n" + "176:" // Height 6: Partial accumulate: partial_4_0 + "tbz x14, #2, 178f\n" + "ld1 { v8.4s }, [x12], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + 
"ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v24.4s }, [x20], #0x10\n" + "ld1 { v28.4s }, [x19], #0x10\n" + "tbz x14, #1, 177f\n" + "ldr d9, [x12], #0x8\n" + "mov x24, #0x18\n" + "ldr d13, [x23], #0x8\n" + "ldr d17, [x22], #0x8\n" + "ldr d21, [x21], #0x8\n" + "ldr d25, [x20], #0x8\n" + "ldr d29, [x19], #0x8\n" + "tbz x14, #0, 180f\n" + "ld1 { v9.s }[2], [x12]\n" + "ld1 { v13.s }[2], [x23]\n" + "ld1 { v17.s }[2], [x22]\n" + "ld1 { v21.s }[2], [x21]\n" + "ld1 { v25.s }[2], [x20]\n" + "ld1 { v29.s }[2], [x19]\n" + "b 180f\n" + "177:" // Height 6: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x14, #0, 180f\n" + "ldr s9, [x12, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "ldr s21, [x21, #0x0]\n" + "ldr s25, [x20, #0x0]\n" + "ldr s29, [x19, #0x0]\n" + "b 180f\n" + "178:" // Height 6: Partial accumulate: partial_2_0 + "tbz x14, #1, 179f\n" + "ldr d8, [x12], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "ldr d16, [x22], #0x8\n" + "ldr d20, [x21], #0x8\n" + "ldr d24, [x20], #0x8\n" + "ldr d28, [x19], #0x8\n" + "tbz x14, #0, 180f\n" + "ld1 { v8.s }[2], [x12]\n" + "ld1 { v12.s }[2], [x23]\n" + "ld1 { v16.s }[2], [x22]\n" + "ld1 { v20.s }[2], [x21]\n" + "ld1 { v24.s }[2], [x20]\n" + "ld1 { v28.s }[2], [x19]\n" + "b 180f\n" + "179:" // Height 6: Partial accumulate: partial_1_0 + "ldr s8, [x12, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "ldr s16, [x22, #0x0]\n" + "ldr s20, [x21, #0x0]\n" + "ldr s24, [x20, #0x0]\n" + "ldr s28, [x19, #0x0]\n" + "180:" // Height 6: Partial accumulate: Done + "sub x12, x12, x24\n" + "b 183f\n" + "181:" // Height 6: full accumulate + "ldr q8, [x12, #0x0]\n" + "ldr q9, [x12, #0x10]\n" + "ldr q10, [x12, #0x20]\n" + "ldr q11, [x12, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + "ldr q16, [x22, #0x0]\n" + "ldr q17, [x22, #0x10]\n" + "ldr q18, [x22, #0x20]\n" + "ldr q19, [x22, #0x30]\n" + 
"ldr q20, [x21, #0x0]\n" + "ldr q21, [x21, #0x10]\n" + "ldr q22, [x21, #0x20]\n" + "ldr q23, [x21, #0x30]\n" + "ldr q24, [x20, #0x0]\n" + "ldr q25, [x20, #0x10]\n" + "ldr q26, [x20, #0x20]\n" + "ldr q27, [x20, #0x30]\n" + "ldr q28, [x19, #0x0]\n" + "ldr q29, [x19, #0x10]\n" + "ldr q30, [x19, #0x20]\n" + "ldr q31, [x19, #0x30]\n" + "b 183f\n" + "182:" // Height 6: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "183:" // Height 6: setup done + "mov x11, #0x0\n" + "184:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w10, [x20, x11, LSL #0x2]\n" + "tbz %x[flags], #3, 185f\n" + "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x9, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x11, 186f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x9, x9, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x20, x20, x19\n" + "b 186f\n" + "185:" // Height 6: setup direct input + "mov x9, %x[input_ptr]\n" + "add x26, x9, x19\n" + "add x24, x26, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x20, x22, x19\n" + "186:" // Height 6: input setup done + "cmp x10, #0x10\n" + "blt 189f\n" + "ldr q0, [x9, #0x0]\n" + "ldr q1, 
[x26, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q2, [x24, #0x0]\n" + "ldr q3, [x23, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x13, #0x0]\n" + "blt 188f\n" + "187:" // Height 6: Multiply loop: Main loop head + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + "ldr d7, [x13, #0x10]\n" + "add x9, x9, #0x10\n" + "ldr x19, [x13, #0x18]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x9, #0x80]\n" + "add x24, x24, #0x10\n" + "ldr d6, [x13, #0x20]\n" + "add x23, x23, #0x10\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0x28]\n" + "ldr x19, [x13, #0x38]\n" + "add x22, x22, #0x10\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "mov v6.d[1], x28\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" + "ldr d7, [x13, #0x30]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x20, x20, #0x10\n" + "ldr d6, [x13, #0x40]\n" + "sub x10, x10, #0x10\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0x48]\n" + "ldr x19, [x13, #0x58]\n" + "cmp x10, #0x20\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "mov v6.d[1], x28\n" + ".inst 0x4f82e0f3 // sdot v19.4s, 
v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" + ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n" + "ldr d7, [x13, #0x50]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr d6, [x13, #0x60]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0x68]\n" + "ldr x19, [x13, #0x78]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + "mov v6.d[1], x28\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" + ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n" + "ldr d7, [x13, #0x70]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "ldr d6, [x13, #0x80]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0x88]\n" + "ldr x19, [x13, #0x98]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + "mov v6.d[1], x28\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot 
v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" + ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n" + "ldr d7, [x13, #0x90]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr d6, [x13, #0xa0]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0xa8]\n" + "ldr x19, [x13, #0xb8]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + "mov v6.d[1], x28\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" + ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n" + "ldr d7, [x13, #0xb0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "ldr d6, [x13, #0xc0]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0xc8]\n" + "ldr x19, [x13, #0xd8]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + "mov v6.d[1], x28\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n" + "ldr d7, [x13, 
#0xd0]\n" + "ldr d6, [x13, #0xe0]\n" + "ldr x28, [x13, #0xe8]\n" + "mov v7.d[1], x19\n" + "ldr x19, [x13, #0xf8]\n" + "ldr x27, [x9, #0x8]\n" + "mov v6.d[1], x28\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n" + "ldr d7, [x13, #0xf0]\n" + "ldr x25, [x26, #0x8]\n" + "add x13, x13, #0x100\n" + "ldr d6, [x13, #0x0]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0x8]\n" + "ldr x21, [x24, #0x8]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + "mov v6.d[1], x28\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n" + "ldr d0, [x9, #0x0]\n" + "ldr d1, [x26, #0x0]\n" + "ldr d2, [x24, #0x0]\n" + "mov v0.d[1], x27\n" + "ldr d3, [x23, #0x0]\n" + "mov v1.d[1], x25\n" + "ldr x19, [x23, #0x8]\n" + "mov v2.d[1], x21\n" + "ldr d4, [x22, #0x0]\n" + "ldr x21, [x22, #0x8]\n" + "mov v3.d[1], x19\n" + "ldr d5, [x20, #0x0]\n" + "ldr x19, [x20, #0x8]\n" + "mov v4.d[1], x21\n" + "mov v5.d[1], x19\n" + "bge 187b\n" + "188:" // Height 6: Multiply loop: Single iteration only + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + 
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + "ldr q7, [x13, #0x10]\n" + "sub x10, x10, #0x10\n" + "ldr q6, [x13, #0x20]\n" + "add x9, x9, #0x10\n" + "prfm pldl1keep, [x9, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" + "ldr q7, [x13, #0x30]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x23, x23, #0x10\n" + "ldr q6, [x13, #0x40]\n" + "add x22, x22, #0x10\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" + ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n" + "ldr q7, [x13, #0x50]\n" + "add x20, x20, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x13, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, 
v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" + ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n" + "ldr q7, [x13, #0x70]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "ldr q6, [x13, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" + ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n" + "ldr q7, [x13, #0x90]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr q6, [x13, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 
// sdot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" + ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n" + "ldr q7, [x13, #0xb0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "ldr q6, [x13, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n" + "ldr q7, [x13, #0xd0]\n" + "ldr q6, [x13, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n" + "ldr q7, [x13, #0xf0]\n" + "add x13, x13, #0x100\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa5e8ff // sdot 
v31.4s, v7.16b, v5.4b[3]\n" + "189:" // Height 6: Multiply loop: Main loop skip + "cbz x10, 194f\n" + "cmp x10, #0x4\n" + "blt 191f\n" + "190:" // Height 6: Multiply loop: Odd block loop + "ldr s0, [x9], #0x4\n" + "sub x10, x10, #0x4\n" + "ldr s1, [x26], #0x4\n" + "cmp x10, #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr s3, [x23], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr s5, [x20], #0x4\n" + "ldr q6, [x13, #0x0]\n" + "ldr q7, [x13, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + "bge 190b\n" + "cbz x10, 194f\n" + "191:" // Height 6: Multiply loop: Skip odd blocks + "tbz x10, #1, 192f\n" + "ldr h0, [x9], #0x2\n" 
+ "ldr h1, [x26], #0x2\n" + "ldr h2, [x24], #0x2\n" + "ldr h3, [x23], #0x2\n" + "ldr h4, [x22], #0x2\n" + "ldr h5, [x20], #0x2\n" + "tbz x10, #0, 193f\n" + "ld1 { v0.b }[2], [x9]\n" + "ld1 { v1.b }[2], [x26]\n" + "ld1 { v2.b }[2], [x24]\n" + "ld1 { v3.b }[2], [x23]\n" + "ld1 { v4.b }[2], [x22]\n" + "ld1 { v5.b }[2], [x20]\n" + "b 193f\n" + "192:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x9, #0x0]\n" + "ldr b1, [x26, #0x0]\n" + "ldr b2, [x24, #0x0]\n" + "ldr b3, [x23, #0x0]\n" + "ldr b4, [x22, #0x0]\n" + "ldr b5, [x20, #0x0]\n" + "193:" // Height 6: Multiply loop: Ragged operand read: Done + "ldr q6, [x13, #0x0]\n" + "ldr q7, [x13, #0x10]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 
0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + "194:" // Height 6: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x11, x11, #0x1\n" + "cmp x11, x19\n" + "bne 184b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x12, #0x0]\n" + "cmp x14, #0x10\n" + "add x23, x12, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" + "prfm pstl1keep, [x21, #0x0]\n" + "add x20, x21, x19, LSL #2\n" + "prfm pstl1keep, [x20, #0x0]\n" + "add x19, x20, x19, LSL #2\n" + "prfm pstl1keep, [x19, #0x0]\n" + "bge 203f\n" + "tbz x14, #3, 198f\n" + "st1 { v8.4s }, [x12], #0x10\n" + "st1 { v9.4s }, [x12], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v13.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v17.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "st1 { v21.4s }, [x21], #0x10\n" + "st1 { v24.4s }, [x20], #0x10\n" + "st1 { v25.4s }, [x20], #0x10\n" + "st1 { v28.4s }, [x19], #0x10\n" + "st1 { v29.4s }, [x19], #0x10\n" + "tbz x14, #2, 196f\n" + "st1 { v10.4s }, [x12], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "st1 { v18.4s }, [x22], #0x10\n" + "st1 { v22.4s }, [x21], #0x10\n" + "st1 { v26.4s }, [x20], #0x10\n" + "st1 { v30.4s }, [x19], #0x10\n" + "tbz x14, #1, 195f\n" + "str d11, [x12], #0x8\n" + "str d15, [x23], #0x8\n" + "str d19, [x22], #0x8\n" + "str d23, [x21], #0x8\n" + "str d27, [x20], #0x8\n" + "str d31, [x19], #0x8\n" + "tbz x14, #0, 202f\n" + "st1 { v11.s }[2], [x12]\n" + "st1 { v15.s }[2], [x23]\n" + "st1 { v19.s }[2], [x22]\n" + "st1 { v23.s }[2], [x21]\n" + "st1 { v27.s }[2], [x20]\n" + "st1 { v31.s }[2], [x19]\n" + "b 202f\n" + "195:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x14, #0, 202f\n" + "str s11, [x12, #0x0]\n" + "str s15, [x23, #0x0]\n" + "str s19, [x22, #0x0]\n" + "str s23, [x21, 
#0x0]\n" + "str s27, [x20, #0x0]\n" + "str s31, [x19, #0x0]\n" + "b 202f\n" + "196:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x14, #1, 197f\n" + "str d10, [x12], #0x8\n" + "str d14, [x23], #0x8\n" + "str d18, [x22], #0x8\n" + "str d22, [x21], #0x8\n" + "str d26, [x20], #0x8\n" + "str d30, [x19], #0x8\n" + "tbz x14, #0, 202f\n" + "st1 { v10.s }[2], [x12]\n" + "st1 { v14.s }[2], [x23]\n" + "st1 { v18.s }[2], [x22]\n" + "st1 { v22.s }[2], [x21]\n" + "st1 { v26.s }[2], [x20]\n" + "st1 { v30.s }[2], [x19]\n" + "b 202f\n" + "197:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x14, #0, 202f\n" + "str s10, [x12, #0x0]\n" + "str s14, [x23, #0x0]\n" + "str s18, [x22, #0x0]\n" + "str s22, [x21, #0x0]\n" + "str s26, [x20, #0x0]\n" + "str s30, [x19, #0x0]\n" + "b 202f\n" + "198:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x14, #2, 200f\n" + "st1 { v8.4s }, [x12], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "st1 { v24.4s }, [x20], #0x10\n" + "st1 { v28.4s }, [x19], #0x10\n" + "tbz x14, #1, 199f\n" + "str d9, [x12], #0x8\n" + "str d13, [x23], #0x8\n" + "str d17, [x22], #0x8\n" + "str d21, [x21], #0x8\n" + "str d25, [x20], #0x8\n" + "str d29, [x19], #0x8\n" + "tbz x14, #0, 202f\n" + "st1 { v9.s }[2], [x12]\n" + "st1 { v13.s }[2], [x23]\n" + "st1 { v17.s }[2], [x22]\n" + "st1 { v21.s }[2], [x21]\n" + "st1 { v25.s }[2], [x20]\n" + "st1 { v29.s }[2], [x19]\n" + "b 202f\n" + "199:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x14, #0, 202f\n" + "str s9, [x12, #0x0]\n" + "str s13, [x23, #0x0]\n" + "str s17, [x22, #0x0]\n" + "str s21, [x21, #0x0]\n" + "str s25, [x20, #0x0]\n" + "str s29, [x19, #0x0]\n" + "b 202f\n" + "200:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x14, #1, 201f\n" + "str d8, [x12], #0x8\n" + "str d12, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "str d20, [x21], #0x8\n" + "str d24, [x20], #0x8\n" + "str d28, [x19], #0x8\n" + 
"tbz x14, #0, 202f\n" + "st1 { v8.s }[2], [x12]\n" + "st1 { v12.s }[2], [x23]\n" + "st1 { v16.s }[2], [x22]\n" + "st1 { v20.s }[2], [x21]\n" + "st1 { v24.s }[2], [x20]\n" + "st1 { v28.s }[2], [x19]\n" + "b 202f\n" + "201:" // Height 6: Partial direct writeback: partial_1_0 + "str s8, [x12, #0x0]\n" + "str s12, [x23, #0x0]\n" + "str s16, [x22, #0x0]\n" + "str s20, [x21, #0x0]\n" + "str s24, [x20, #0x0]\n" + "str s28, [x19, #0x0]\n" + "202:" // Height 6: Partial direct writeback: Done + "b 204f\n" + "203:" // Height 6: Full writeback + "str q8, [x12, #0x0]\n" + "str q9, [x12, #0x10]\n" + "str q10, [x12, #0x20]\n" + "str q11, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + "str q12, [x23, #0x0]\n" + "str q13, [x23, #0x10]\n" + "str q14, [x23, #0x20]\n" + "str q15, [x23, #0x30]\n" + "str q16, [x22, #0x0]\n" + "str q17, [x22, #0x10]\n" + "str q18, [x22, #0x20]\n" + "str q19, [x22, #0x30]\n" + "str q20, [x21, #0x0]\n" + "str q21, [x21, #0x10]\n" + "str q22, [x21, #0x20]\n" + "str q23, [x21, #0x30]\n" + "str q24, [x20, #0x0]\n" + "str q25, [x20, #0x10]\n" + "str q26, [x20, #0x20]\n" + "str q27, [x20, #0x30]\n" + "str q28, [x19, #0x0]\n" + "str q29, [x19, #0x10]\n" + "str q30, [x19, #0x20]\n" + "str q31, [x19, #0x30]\n" + "204:" // Height 6: Writeback done + "subs x14, x14, #0x10\n" + "bgt 172b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 206f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 205f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "205:" // Update direct input + "mov x19, #0x6\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "206:" // Exit + + : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" 
(offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp index 0423a9de11..3566027a50 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp @@ -80,392 +80,378 @@ void a64_hybrid_s8s32_dot_6x16 ( "1:" // Row loop "cmp %x[M], #0x6\n" - "bge 176f\n" + "bge 171f\n" "cmp %x[M], #0x4\n" - "bgt 141f\n" - "beq 106f\n" + "bgt 137f\n" + "beq 103f\n" "cmp %x[M], #0x2\n" - "bgt 71f\n" - "beq 36f\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 2f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "b 3f\n" - "2:" // Height 1: setup direct output - "mov x13, %x[output_ptr]\n" - "3:" // Height 1: Column loop - "tbz %x[flags], #0, 13f\n" - "cmp x15, #0x10\n" - "bge 12f\n" - "tbz x15, #3, 7f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v9.4s }, [x13], #0x10\n" - "tbz x15, #2, 5f\n" - "ld1 { v10.4s }, [x13], #0x10\n" - "tbz x15, #1, 4f\n" - "mov x19, #0x38\n" - "ldr d11, [x13], #0x8\n" - "tbz x15, #0, 11f\n" - "ld1 { v11.s }[2], [x13]\n" - "b 11f\n" - "4:" // Height 1: Partial 
accumulate: partial_1_12 - "mov x19, #0x30\n" - "tbz x15, #0, 11f\n" - "ldr s11, [x13, #0x0]\n" - "b 11f\n" - "5:" // Height 1: Partial accumulate: partial_2_8 - "tbz x15, #1, 6f\n" - "ldr d10, [x13], #0x8\n" - "mov x19, #0x28\n" - "tbz x15, #0, 11f\n" - "ld1 { v10.s }[2], [x13]\n" - "b 11f\n" - "6:" // Height 1: Partial accumulate: partial_1_8 - "mov x19, #0x20\n" - "tbz x15, #0, 11f\n" - "ldr s10, [x13, #0x0]\n" - "b 11f\n" - "7:" // Height 1: Partial accumulate: partial_4_0 - "tbz x15, #2, 9f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "tbz x15, #1, 8f\n" - "mov x19, #0x18\n" - "ldr d9, [x13], #0x8\n" - "tbz x15, #0, 11f\n" - "ld1 { v9.s }[2], [x13]\n" - "b 11f\n" - "8:" // Height 1: Partial accumulate: partial_1_4 - "mov x19, #0x10\n" - "tbz x15, #0, 11f\n" - "ldr s9, [x13, #0x0]\n" - "b 11f\n" - "9:" // Height 1: Partial accumulate: partial_2_0 - "tbz x15, #1, 10f\n" - "ldr d8, [x13], #0x8\n" - "mov x19, #0x8\n" - "tbz x15, #0, 11f\n" - "ld1 { v8.s }[2], [x13]\n" - "b 11f\n" - "10:" // Height 1: Partial accumulate: partial_1_0 - "mov x19, #0x0\n" - "ldr s8, [x13, #0x0]\n" - "11:" // Height 1: Partial accumulate: Done - "sub x13, x13, x19\n" - "b 14f\n" - "12:" // Height 1: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "b 14f\n" - "13:" // Height 1: no accumulate + "bgt 69f\n" + "beq 35f\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "2:" // Height 1: Column loop + "tbz %x[flags], #0, 12f\n" + "cmp x10, #0x10\n" + "bge 11f\n" + "tbz x10, #3, 6f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "tbz x10, #2, 4f\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "tbz x10, #1, 3f\n" + "mov x24, #0x38\n" + "ldr d11, [x28], #0x8\n" + "tbz x10, #0, 10f\n" + "ld1 { v11.s }[2], [x28]\n" + "b 10f\n" + "3:" // Height 1: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x10, #0, 10f\n" + "ldr 
s11, [x28, #0x0]\n" + "b 10f\n" + "4:" // Height 1: Partial accumulate: partial_2_8 + "tbz x10, #1, 5f\n" + "ldr d10, [x28], #0x8\n" + "mov x24, #0x28\n" + "tbz x10, #0, 10f\n" + "ld1 { v10.s }[2], [x28]\n" + "b 10f\n" + "5:" // Height 1: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x10, #0, 10f\n" + "ldr s10, [x28, #0x0]\n" + "b 10f\n" + "6:" // Height 1: Partial accumulate: partial_4_0 + "tbz x10, #2, 8f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "tbz x10, #1, 7f\n" + "ldr d9, [x28], #0x8\n" + "mov x24, #0x18\n" + "tbz x10, #0, 10f\n" + "ld1 { v9.s }[2], [x28]\n" + "b 10f\n" + "7:" // Height 1: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x10, #0, 10f\n" + "ldr s9, [x28, #0x0]\n" + "b 10f\n" + "8:" // Height 1: Partial accumulate: partial_2_0 + "tbz x10, #1, 9f\n" + "ldr d8, [x28], #0x8\n" + "mov x24, #0x8\n" + "tbz x10, #0, 10f\n" + "ld1 { v8.s }[2], [x28]\n" + "b 10f\n" + "9:" // Height 1: Partial accumulate: partial_1_0 + "ldr s8, [x28, #0x0]\n" + "mov x24, #0x0\n" + "10:" // Height 1: Partial accumulate: Done + "sub x28, x28, x24\n" + "b 13f\n" + "11:" // Height 1: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "b 13f\n" + "12:" // Height 1: no accumulate "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" "movi v11.4s, #0x0\n" - "14:" // Height 1: setup done - "mov x12, #0x0\n" - "15:" // Height 1: String loop + "13:" // Height 1: setup done + "mov x27, #0x0\n" + "14:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 16f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 15f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "cbnz x12, 17f\n" + "ldr x25, [x20, #0x0]\n" + "cbnz x27, 16f\n" 
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "b 17f\n" - "16:" // Height 1: setup direct input - "mov x10, %x[input_ptr]\n" - "17:" // Height 1: input setup done - "cmp x11, #0x10\n" - "blt 20f\n" - "cmp x11, #0x20\n" + "add x25, x25, x19\n" + "b 16f\n" + "15:" // Height 1: setup direct input + "mov x25, %x[input_ptr]\n" + "16:" // Height 1: input setup done + "cmp x26, #0x10\n" "blt 19f\n" - "18:" // Height 1: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "ldr q0, [x25, #0x0]\n" + "ldr q6, [x9, #0x0]\n" + "cmp x26, #0x20\n" + "blt 18f\n" + "17:" // Height 1: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q7, [x9, #0x10]\n" + "add x25, x25, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q6, [x9, #0x20]\n" + "sub x26, x26, #0x10\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x14, #0x40]\n" - "add x10, x10, #0x10\n" + "ldr q7, [x9, #0x30]\n" + "cmp x26, #0x20\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q6, [x9, #0x40]\n" + "ldr q7, [x9, #0x50]\n" ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x14, #0x60]\n" - "sub x11, x11, #0x10\n" + "ldr q6, [x9, #0x60]\n" ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x14, #0x70]\n" - "cmp x11, #0x20\n" + "ldr q7, [x9, #0x70]\n" + "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x9, #0xa0]\n" ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x9, 
#0xb0]\n" ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x9, #0xc0]\n" ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x9, #0xd0]\n" ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x9, #0xe0]\n" ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" - "add x14, x14, #0x100\n" + "ldr q7, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + "ldr q6, [x9, #0x0]\n" ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" - "bge 18b\n" - "19:" // Height 1: Multiply loop: Single iteration only - "sub x11, x11, #0x10\n" - "ldr q0, [x10, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "ldr q0, [x25, #0x0]\n" + "bge 17b\n" + "18:" // Height 1: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q7, [x9, #0x10]\n" + "sub x26, x26, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q6, [x9, #0x20]\n" + "add x25, x25, #0x10\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x14, #0x40]\n" - "add x10, x10, #0x10\n" + "ldr q7, [x9, #0x30]\n" + "ldr q6, [x9, #0x40]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x9, #0x50]\n" ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x9, #0x60]\n" + "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x9, #0x70]\n" ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x14, 
#0xa0]\n" + "ldr q6, [x9, #0xa0]\n" ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x9, #0xb0]\n" ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x9, #0xc0]\n" ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x9, #0xd0]\n" ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x9, #0xe0]\n" ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" - "add x14, x14, #0x100\n" + "ldr q7, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" - "20:" // Height 1: Multiply loop: Main loop skip - "cbz x11, 25f\n" - "cmp x11, #0x4\n" - "blt 22f\n" - "21:" // Height 1: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr q6, [x14, #0x0]\n" + "19:" // Height 1: Multiply loop: Main loop skip + "cbz x26, 24f\n" + "cmp x26, #0x4\n" + "blt 21f\n" + "20:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x4\n" + "ldr q6, [x9, #0x0]\n" ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q7, [x9, #0x10]\n" + "cmp x26, #0x4\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q6, [x9, #0x20]\n" + "ldr q7, [x9, #0x30]\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "sub x11, x11, #0x4\n" - "add x14, x14, #0x40\n" + "add x9, x9, #0x40\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - "cmp x11, #0x4\n" - "bge 21b\n" - "cbz x11, 25f\n" - "22:" // Height 1: Multiply loop: Skip odd blocks - "tbz x11, #1, 23f\n" - "ldr h0, [x10], #0x2\n" - "tbz x11, #0, 24f\n" - "ld1 { v0.b }[2], [x10]\n" - "b 24f\n" - "23:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x10, #0x0]\n" - "24:" // Height 1: Multiply loop: Ragged operand 
read: Done - "ldr q6, [x14, #0x0]\n" + "bge 20b\n" + "cbz x26, 24f\n" + "21:" // Height 1: Multiply loop: Skip odd blocks + "tbz x26, #1, 22f\n" + "ldr h0, [x25], #0x2\n" + "tbz x26, #0, 23f\n" + "ld1 { v0.b }[2], [x25]\n" + "b 23f\n" + "22:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x25, #0x0]\n" + "23:" // Height 1: Multiply loop: Ragged operand read: Done + "ldr q6, [x9, #0x0]\n" ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q7, [x9, #0x10]\n" + "ldr q6, [x9, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" + "add x9, x9, #0x40\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - "25:" // Height 1: Multiply loop: No odd multiplies + "24:" // Height 1: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 15b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "cmp x15, #0x10\n" - "bge 34f\n" - "tbz x15, #3, 29f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v9.4s }, [x13], #0x10\n" - "tbz x15, #2, 27f\n" - "st1 { v10.4s }, [x13], #0x10\n" - "tbz x15, #1, 26f\n" - "str d11, [x13], #0x8\n" - "tbz x15, #0, 33f\n" - "st1 { v11.s }[2], [x13]\n" - "b 33f\n" - "26:" // Height 1: Partial direct writeback: partial_1_12 - "tbz x15, #0, 33f\n" - "str s11, [x13, #0x0]\n" - "b 33f\n" - "27:" // Height 1: Partial direct writeback: partial_2_8 - "tbz x15, #1, 28f\n" - "str d10, [x13], #0x8\n" - "tbz x15, #0, 33f\n" - "st1 { v10.s }[2], [x13]\n" - "b 33f\n" - "28:" // Height 1: Partial direct writeback: partial_1_8 - "tbz x15, #0, 33f\n" - "str s10, [x13, #0x0]\n" - "b 33f\n" - "29:" // Height 1: Partial direct writeback: partial_4_0 - "tbz x15, #2, 31f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "tbz x15, #1, 30f\n" - "str d9, [x13], #0x8\n" - "tbz x15, #0, 33f\n" - 
"st1 { v9.s }[2], [x13]\n" - "b 33f\n" - "30:" // Height 1: Partial direct writeback: partial_1_4 - "tbz x15, #0, 33f\n" - "str s9, [x13, #0x0]\n" - "b 33f\n" - "31:" // Height 1: Partial direct writeback: partial_2_0 - "tbz x15, #1, 32f\n" - "str d8, [x13], #0x8\n" - "tbz x15, #0, 33f\n" - "st1 { v8.s }[2], [x13]\n" - "b 33f\n" - "32:" // Height 1: Partial direct writeback: partial_1_0 - "str s8, [x13, #0x0]\n" - "33:" // Height 1: Partial direct writeback: Done - "b 35f\n" - "34:" // Height 1: Full writeback - "str q8, [x13, #0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "add x13, x13, #0x40\n" - "35:" // Height 1: Writeback done - "subs x15, x15, #0x10\n" - "bgt 3b\n" - "b 212f\n" - "36:" // Height 2 - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 14b\n" + "prfm pstl1keep, [x28, #0x0]\n" + "cmp x10, #0x10\n" + "bge 33f\n" + "tbz x10, #3, 28f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v9.4s }, [x28], #0x10\n" + "tbz x10, #2, 26f\n" + "st1 { v10.4s }, [x28], #0x10\n" + "tbz x10, #1, 25f\n" + "str d11, [x28], #0x8\n" + "tbz x10, #0, 32f\n" + "st1 { v11.s }[2], [x28]\n" + "b 32f\n" + "25:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x10, #0, 32f\n" + "str s11, [x28, #0x0]\n" + "b 32f\n" + "26:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x10, #1, 27f\n" + "str d10, [x28], #0x8\n" + "tbz x10, #0, 32f\n" + "st1 { v10.s }[2], [x28]\n" + "b 32f\n" + "27:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x10, #0, 32f\n" + "str s10, [x28, #0x0]\n" + "b 32f\n" + "28:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x10, #2, 30f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "tbz x10, #1, 29f\n" + "str d9, [x28], #0x8\n" + "tbz x10, #0, 32f\n" + "st1 { v9.s }[2], [x28]\n" + "b 32f\n" + "29:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x10, #0, 32f\n" + "str s9, [x28, #0x0]\n" 
+ "b 32f\n" + "30:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x10, #1, 31f\n" + "str d8, [x28], #0x8\n" + "tbz x10, #0, 32f\n" + "st1 { v8.s }[2], [x28]\n" + "b 32f\n" + "31:" // Height 1: Partial direct writeback: partial_1_0 + "str s8, [x28, #0x0]\n" + "32:" // Height 1: Partial direct writeback: Done + "b 34f\n" + "33:" // Height 1: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "34:" // Height 1: Writeback done + "subs x10, x10, #0x10\n" + "bgt 2b\n" + "b 206f\n" + "35:" // Height 2 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "36:" // Height 2: Column loop + "tbz %x[flags], #0, 46f\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 37f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "add x9, x9, x19, LSL #2\n" - "b 38f\n" - "37:" // Height 2: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "38:" // Height 2: Column loop - "tbz %x[flags], #0, 48f\n" - "cmp x15, #0x10\n" - "bge 47f\n" - "tbz x15, #3, 42f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v9.4s }, [x13], #0x10\n" - "ld1 { v13.4s }, [x9], #0x10\n" - "tbz x15, #2, 40f\n" - "ld1 { v10.4s }, [x13], #0x10\n" - "ld1 { v14.4s }, [x9], #0x10\n" - "tbz x15, #1, 39f\n" - "mov x19, #0x38\n" - "ldr d11, [x13], #0x8\n" - "ldr d15, [x9], #0x8\n" - "tbz x15, #0, 46f\n" - "ld1 { v11.s }[2], [x13]\n" - "ld1 { v15.s }[2], [x9]\n" - "b 46f\n" - "39:" // Height 2: Partial accumulate: partial_1_12 - "mov x19, #0x30\n" - "tbz x15, #0, 46f\n" - "ldr s11, [x13, #0x0]\n" - "ldr s15, [x9, #0x0]\n" - "b 46f\n" - "40:" // Height 2: Partial accumulate: partial_2_8 - "tbz x15, #1, 41f\n" - "ldr d10, [x13], #0x8\n" - "ldr d14, [x9], #0x8\n" - "mov x19, #0x28\n" - "tbz x15, 
#0, 46f\n" - "ld1 { v10.s }[2], [x13]\n" - "ld1 { v14.s }[2], [x9]\n" - "b 46f\n" - "41:" // Height 2: Partial accumulate: partial_1_8 - "mov x19, #0x20\n" - "tbz x15, #0, 46f\n" - "ldr s10, [x13, #0x0]\n" - "ldr s14, [x9, #0x0]\n" - "b 46f\n" - "42:" // Height 2: Partial accumulate: partial_4_0 - "tbz x15, #2, 44f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "tbz x15, #1, 43f\n" - "mov x19, #0x18\n" - "ldr d9, [x13], #0x8\n" - "ldr d13, [x9], #0x8\n" - "tbz x15, #0, 46f\n" - "ld1 { v9.s }[2], [x13]\n" - "ld1 { v13.s }[2], [x9]\n" - "b 46f\n" - "43:" // Height 2: Partial accumulate: partial_1_4 - "mov x19, #0x10\n" - "tbz x15, #0, 46f\n" - "ldr s9, [x13, #0x0]\n" - "ldr s13, [x9, #0x0]\n" - "b 46f\n" - "44:" // Height 2: Partial accumulate: partial_2_0 - "tbz x15, #1, 45f\n" - "ldr d8, [x13], #0x8\n" - "ldr d12, [x9], #0x8\n" - "mov x19, #0x8\n" - "tbz x15, #0, 46f\n" - "ld1 { v8.s }[2], [x13]\n" - "ld1 { v12.s }[2], [x9]\n" - "b 46f\n" - "45:" // Height 2: Partial accumulate: partial_1_0 - "mov x19, #0x0\n" - "ldr s8, [x13, #0x0]\n" - "ldr s12, [x9, #0x0]\n" - "46:" // Height 2: Partial accumulate: Done - "sub x13, x13, x19\n" - "sub x9, x9, x19\n" - "b 49f\n" - "47:" // Height 2: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "ldr q12, [x9, #0x0]\n" - "ldr q13, [x9, #0x10]\n" - "ldr q14, [x9, #0x20]\n" - "ldr q15, [x9, #0x30]\n" - "b 49f\n" - "48:" // Height 2: no accumulate + "cmp x10, #0x10\n" + "add x23, x28, x19, LSL #2\n" + "bge 45f\n" + "tbz x10, #3, 40f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "tbz x10, #2, 38f\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "tbz x10, #1, 37f\n" + "mov x24, #0x38\n" + "ldr d11, [x28], #0x8\n" + "ldr d15, [x23], #0x8\n" + "tbz x10, #0, 44f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v15.s 
}[2], [x23]\n" + "b 44f\n" + "37:" // Height 2: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x10, #0, 44f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "b 44f\n" + "38:" // Height 2: Partial accumulate: partial_2_8 + "tbz x10, #1, 39f\n" + "ldr d10, [x28], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "tbz x10, #0, 44f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x23]\n" + "b 44f\n" + "39:" // Height 2: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x10, #0, 44f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "b 44f\n" + "40:" // Height 2: Partial accumulate: partial_4_0 + "tbz x10, #2, 42f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "tbz x10, #1, 41f\n" + "mov x24, #0x18\n" + "ldr d9, [x28], #0x8\n" + "ldr d13, [x23], #0x8\n" + "tbz x10, #0, 44f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x23]\n" + "b 44f\n" + "41:" // Height 2: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x10, #0, 44f\n" + "ldr s9, [x28, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "b 44f\n" + "42:" // Height 2: Partial accumulate: partial_2_0 + "tbz x10, #1, 43f\n" + "ldr d8, [x28], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "tbz x10, #0, 44f\n" + "ld1 { v8.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x23]\n" + "b 44f\n" + "43:" // Height 2: Partial accumulate: partial_1_0 + "ldr s8, [x28, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "44:" // Height 2: Partial accumulate: Done + "sub x28, x28, x24\n" + "b 47f\n" + "45:" // Height 2: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + "b 47f\n" + "46:" // Height 2: no accumulate "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -474,405 +460,395 @@ void a64_hybrid_s8s32_dot_6x16 ( "movi v13.4s, #0x0\n" 
"movi v14.4s, #0x0\n" "movi v15.4s, #0x0\n" - "49:" // Height 2: setup done - "mov x12, #0x0\n" - "50:" // Height 2: String loop + "47:" // Height 2: setup done + "mov x27, #0x0\n" + "48:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 51f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 49f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "cbnz x12, 52f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x27, 50f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "b 52f\n" - "51:" // Height 2: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "52:" // Height 2: input setup done - "cmp x11, #0x10\n" - "blt 55f\n" - "cmp x11, #0x20\n" - "blt 54f\n" - "53:" // Height 2: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "b 50f\n" + "49:" // Height 2: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "50:" // Height 2: input setup done + "cmp x26, #0x10\n" + "blt 53f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x20\n" + "ldr q6, [x9, #0x0]\n" + "blt 52f\n" + "51:" // Height 2: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" + "add x25, x25, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x14, #0x20]\n" - "add x10, x10, #0x10\n" + "ldr q6, [x9, #0x20]\n" + "add x24, x24, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x25, 
#0x80]\n" + "sub x26, x26, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" + "cmp x26, #0x20\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "sub x11, x11, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x14, #0x40]\n" - "cmp x11, #0x20\n" + "ldr q6, [x9, #0x40]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x9, #0x50]\n" ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x9, #0x60]\n" ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x9, #0x70]\n" ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x9, #0xa0]\n" ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x9, #0xb0]\n" ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x9, #0xc0]\n" ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x9, #0xd0]\n" ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - 
"ldr q6, [x14, #0xe0]\n" + "ldr q6, [x9, #0xe0]\n" ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" - "add x14, x14, #0x100\n" + "ldr q7, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + "ldr q6, [x9, #0x0]\n" ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + "ldr q0, [x25, #0x0]\n" ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" - "bge 53b\n" - "54:" // Height 2: Multiply loop: Single iteration only - "sub x11, x11, #0x10\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "bge 51b\n" + "52:" // Height 2: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" + "sub x26, x26, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x14, #0x20]\n" - "add x10, x10, #0x10\n" + "ldr q6, [x9, #0x20]\n" + "add x25, x25, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x14, #0x40]\n" + "ldr q6, [x9, #0x40]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x9, #0x50]\n" ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x9, #0x60]\n" ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ed 
// sdot v13.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x9, #0x70]\n" ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x9, #0xa0]\n" ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x9, #0xb0]\n" ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x9, #0xc0]\n" ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x9, #0xd0]\n" ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x9, #0xe0]\n" ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" - "add x14, x14, #0x100\n" + "ldr q7, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" - "55:" // Height 2: Multiply loop: Main loop skip - "cbz x11, 60f\n" - "cmp x11, #0x4\n" - "blt 57f\n" - "56:" // Height 2: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x28], #0x4\n" - "ldr q6, [x14, #0x0]\n" + "53:" // Height 2: Multiply loop: Main loop skip + "cbz x26, 58f\n" + "cmp x26, #0x4\n" + "blt 55f\n" + "54:" // Height 2: 
Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x4\n" + "ldr s1, [x24], #0x4\n" + "cmp x26, #0x4\n" + "ldr q6, [x9, #0x0]\n" ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x14, #0x20]\n" - "sub x11, x11, #0x4\n" + "ldr q6, [x9, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "cmp x11, #0x4\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" + "add x9, x9, #0x40\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - "bge 56b\n" - "cbz x11, 60f\n" - "57:" // Height 2: Multiply loop: Skip odd blocks - "tbz x11, #1, 58f\n" - "ldr h0, [x10], #0x2\n" - "ldr h1, [x28], #0x2\n" - "tbz x11, #0, 59f\n" - "ld1 { v0.b }[2], [x10]\n" - "ld1 { v1.b }[2], [x28]\n" - "b 59f\n" - "58:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x10, #0x0]\n" - "ldr b1, [x28, #0x0]\n" - "59:" // Height 2: Multiply loop: Ragged operand read: Done - "ldr q6, [x14, #0x0]\n" + "bge 54b\n" + "cbz x26, 58f\n" + "55:" // Height 2: Multiply loop: Skip odd blocks + "tbz x26, #1, 56f\n" + "ldr h0, [x25], #0x2\n" + "ldr h1, [x24], #0x2\n" + "tbz x26, #0, 57f\n" + "ld1 { v0.b }[2], [x25]\n" + "ld1 { v1.b }[2], [x24]\n" + "b 57f\n" + "56:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x25, #0x0]\n" + "ldr b1, [x24, #0x0]\n" + "57:" // Height 2: Multiply loop: Ragged operand read: Done + "ldr q6, [x9, #0x0]\n" ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x9, #0x20]\n" ".inst 0x4f80e0e9 // sdot 
v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" + "add x9, x9, #0x40\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - "60:" // Height 2: Multiply loop: No odd multiplies + "58:" // Height 2: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 50b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "cmp x15, #0x10\n" - "bge 69f\n" - "tbz x15, #3, 64f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v9.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v13.4s }, [x9], #0x10\n" - "tbz x15, #2, 62f\n" - "st1 { v10.4s }, [x13], #0x10\n" - "st1 { v14.4s }, [x9], #0x10\n" - "tbz x15, #1, 61f\n" - "str d11, [x13], #0x8\n" - "str d15, [x9], #0x8\n" - "tbz x15, #0, 68f\n" - "st1 { v11.s }[2], [x13]\n" - "st1 { v15.s }[2], [x9]\n" - "b 68f\n" - "61:" // Height 2: Partial direct writeback: partial_1_12 - "tbz x15, #0, 68f\n" - "str s11, [x13, #0x0]\n" - "str s15, [x9, #0x0]\n" - "b 68f\n" - "62:" // Height 2: Partial direct writeback: partial_2_8 - "tbz x15, #1, 63f\n" - "str d10, [x13], #0x8\n" - "str d14, [x9], #0x8\n" - "tbz x15, #0, 68f\n" - "st1 { v10.s }[2], [x13]\n" - "st1 { v14.s }[2], [x9]\n" - "b 68f\n" - "63:" // Height 2: Partial direct writeback: partial_1_8 - "tbz x15, #0, 68f\n" - "str s10, [x13, #0x0]\n" - "str s14, [x9, #0x0]\n" - "b 68f\n" - "64:" // Height 2: Partial direct writeback: partial_4_0 - "tbz x15, #2, 66f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "tbz x15, #1, 65f\n" - "str d9, [x13], #0x8\n" - "str d13, [x9], #0x8\n" - "tbz x15, #0, 68f\n" - "st1 { v9.s }[2], [x13]\n" - "st1 { v13.s }[2], [x9]\n" - "b 68f\n" - "65:" // 
Height 2: Partial direct writeback: partial_1_4 - "tbz x15, #0, 68f\n" - "str s9, [x13, #0x0]\n" - "str s13, [x9, #0x0]\n" - "b 68f\n" - "66:" // Height 2: Partial direct writeback: partial_2_0 - "tbz x15, #1, 67f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "tbz x15, #0, 68f\n" - "st1 { v8.s }[2], [x13]\n" - "st1 { v12.s }[2], [x9]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 48b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "cmp x10, #0x10\n" + "add x23, x28, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "bge 67f\n" + "tbz x10, #3, 62f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v9.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v13.4s }, [x23], #0x10\n" + "tbz x10, #2, 60f\n" + "st1 { v10.4s }, [x28], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "tbz x10, #1, 59f\n" + "str d11, [x28], #0x8\n" + "str d15, [x23], #0x8\n" + "tbz x10, #0, 66f\n" + "st1 { v11.s }[2], [x28]\n" + "st1 { v15.s }[2], [x23]\n" + "b 66f\n" + "59:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x10, #0, 66f\n" + "str s11, [x28, #0x0]\n" + "str s15, [x23, #0x0]\n" + "b 66f\n" + "60:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x10, #1, 61f\n" + "str d10, [x28], #0x8\n" + "str d14, [x23], #0x8\n" + "tbz x10, #0, 66f\n" + "st1 { v10.s }[2], [x28]\n" + "st1 { v14.s }[2], [x23]\n" + "b 66f\n" + "61:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x10, #0, 66f\n" + "str s10, [x28, #0x0]\n" + "str s14, [x23, #0x0]\n" + "b 66f\n" + "62:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x10, #2, 64f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "tbz x10, #1, 63f\n" + "str d9, [x28], #0x8\n" + "str d13, [x23], #0x8\n" + "tbz x10, #0, 66f\n" + "st1 { v9.s }[2], [x28]\n" + "st1 { v13.s }[2], [x23]\n" + "b 66f\n" + "63:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x10, #0, 66f\n" + "str s9, [x28, #0x0]\n" + "str s13, 
[x23, #0x0]\n" + "b 66f\n" + "64:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x10, #1, 65f\n" + "str d8, [x28], #0x8\n" + "str d12, [x23], #0x8\n" + "tbz x10, #0, 66f\n" + "st1 { v8.s }[2], [x28]\n" + "st1 { v12.s }[2], [x23]\n" + "b 66f\n" + "65:" // Height 2: Partial direct writeback: partial_1_0 + "str s8, [x28, #0x0]\n" + "str s12, [x23, #0x0]\n" + "66:" // Height 2: Partial direct writeback: Done "b 68f\n" - "67:" // Height 2: Partial direct writeback: partial_1_0 - "str s8, [x13, #0x0]\n" - "str s12, [x9, #0x0]\n" - "68:" // Height 2: Partial direct writeback: Done - "b 70f\n" - "69:" // Height 2: Full writeback - "str q8, [x13, #0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "str q12, [x9, #0x0]\n" - "str q13, [x9, #0x10]\n" - "str q14, [x9, #0x20]\n" - "str q15, [x9, #0x30]\n" - "add x13, x13, #0x40\n" - "add x9, x9, #0x40\n" - "70:" // Height 2: Writeback done - "subs x15, x15, #0x10\n" - "bgt 38b\n" - "b 212f\n" - "71:" // Height 3 - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "67:" // Height 2: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q12, [x23, #0x0]\n" + "str q13, [x23, #0x10]\n" + "str q14, [x23, #0x20]\n" + "str q15, [x23, #0x30]\n" + "68:" // Height 2: Writeback done + "subs x10, x10, #0x10\n" + "bgt 36b\n" + "b 206f\n" + "69:" // Height 3 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "70:" // Height 3: Column loop + "tbz %x[flags], #0, 80f\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 72f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "add x27, x27, x19, LSL #2\n" - "b 73f\n" - "72:" 
// Height 3: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "73:" // Height 3: Column loop - "tbz %x[flags], #0, 83f\n" - "cmp x15, #0x10\n" - "bge 82f\n" - "tbz x15, #3, 77f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "ld1 { v9.4s }, [x13], #0x10\n" - "ld1 { v13.4s }, [x9], #0x10\n" - "ld1 { v17.4s }, [x27], #0x10\n" - "tbz x15, #2, 75f\n" - "ld1 { v10.4s }, [x13], #0x10\n" - "ld1 { v14.4s }, [x9], #0x10\n" - "ld1 { v18.4s }, [x27], #0x10\n" - "tbz x15, #1, 74f\n" - "mov x19, #0x38\n" - "ldr d11, [x13], #0x8\n" - "ldr d15, [x9], #0x8\n" - "ldr d19, [x27], #0x8\n" - "tbz x15, #0, 81f\n" - "ld1 { v11.s }[2], [x13]\n" - "ld1 { v15.s }[2], [x9]\n" - "ld1 { v19.s }[2], [x27]\n" - "b 81f\n" - "74:" // Height 3: Partial accumulate: partial_1_12 - "mov x19, #0x30\n" - "tbz x15, #0, 81f\n" - "ldr s11, [x13, #0x0]\n" - "ldr s15, [x9, #0x0]\n" - "ldr s19, [x27, #0x0]\n" - "b 81f\n" - "75:" // Height 3: Partial accumulate: partial_2_8 - "tbz x15, #1, 76f\n" - "ldr d10, [x13], #0x8\n" - "ldr d14, [x9], #0x8\n" - "ldr d18, [x27], #0x8\n" - "mov x19, #0x28\n" - "tbz x15, #0, 81f\n" - "ld1 { v10.s }[2], [x13]\n" - "ld1 { v14.s }[2], [x9]\n" - "ld1 { v18.s }[2], [x27]\n" + "cmp x10, #0x10\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "bge 79f\n" + "tbz x10, #3, 74f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "tbz x10, #2, 72f\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { v18.4s }, [x22], #0x10\n" + "tbz x10, #1, 71f\n" + "mov x24, #0x38\n" + "ldr d11, [x28], #0x8\n" + "ldr d15, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "tbz x10, #0, 78f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x23]\n" + "ld1 { v19.s }[2], 
[x22]\n" + "b 78f\n" + "71:" // Height 3: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x10, #0, 78f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "b 78f\n" + "72:" // Height 3: Partial accumulate: partial_2_8 + "tbz x10, #1, 73f\n" + "ldr d10, [x28], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "ldr d18, [x22], #0x8\n" + "tbz x10, #0, 78f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x23]\n" + "ld1 { v18.s }[2], [x22]\n" + "b 78f\n" + "73:" // Height 3: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x10, #0, 78f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "b 78f\n" + "74:" // Height 3: Partial accumulate: partial_4_0 + "tbz x10, #2, 76f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "tbz x10, #1, 75f\n" + "mov x24, #0x18\n" + "ldr d9, [x28], #0x8\n" + "ldr d13, [x23], #0x8\n" + "ldr d17, [x22], #0x8\n" + "tbz x10, #0, 78f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x23]\n" + "ld1 { v17.s }[2], [x22]\n" + "b 78f\n" + "75:" // Height 3: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x10, #0, 78f\n" + "ldr s9, [x28, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "b 78f\n" + "76:" // Height 3: Partial accumulate: partial_2_0 + "tbz x10, #1, 77f\n" + "ldr d8, [x28], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "ldr d16, [x22], #0x8\n" + "tbz x10, #0, 78f\n" + "ld1 { v8.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x23]\n" + "ld1 { v16.s }[2], [x22]\n" + "b 78f\n" + "77:" // Height 3: Partial accumulate: partial_1_0 + "ldr s8, [x28, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "ldr s16, [x22, #0x0]\n" + "78:" // Height 3: Partial accumulate: Done + "sub x28, x28, x24\n" "b 81f\n" - "76:" // Height 3: Partial accumulate: partial_1_8 - "mov x19, #0x20\n" - "tbz x15, #0, 81f\n" - "ldr s10, [x13, #0x0]\n" - "ldr s14, [x9, 
#0x0]\n" - "ldr s18, [x27, #0x0]\n" + "79:" // Height 3: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + "ldr q16, [x22, #0x0]\n" + "ldr q17, [x22, #0x10]\n" + "ldr q18, [x22, #0x20]\n" + "ldr q19, [x22, #0x30]\n" "b 81f\n" - "77:" // Height 3: Partial accumulate: partial_4_0 - "tbz x15, #2, 79f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "tbz x15, #1, 78f\n" - "mov x19, #0x18\n" - "ldr d9, [x13], #0x8\n" - "ldr d13, [x9], #0x8\n" - "ldr d17, [x27], #0x8\n" - "tbz x15, #0, 81f\n" - "ld1 { v9.s }[2], [x13]\n" - "ld1 { v13.s }[2], [x9]\n" - "ld1 { v17.s }[2], [x27]\n" - "b 81f\n" - "78:" // Height 3: Partial accumulate: partial_1_4 - "mov x19, #0x10\n" - "tbz x15, #0, 81f\n" - "ldr s9, [x13, #0x0]\n" - "ldr s13, [x9, #0x0]\n" - "ldr s17, [x27, #0x0]\n" - "b 81f\n" - "79:" // Height 3: Partial accumulate: partial_2_0 - "tbz x15, #1, 80f\n" - "ldr d8, [x13], #0x8\n" - "ldr d12, [x9], #0x8\n" - "ldr d16, [x27], #0x8\n" - "mov x19, #0x8\n" - "tbz x15, #0, 81f\n" - "ld1 { v8.s }[2], [x13]\n" - "ld1 { v12.s }[2], [x9]\n" - "ld1 { v16.s }[2], [x27]\n" - "b 81f\n" - "80:" // Height 3: Partial accumulate: partial_1_0 - "mov x19, #0x0\n" - "ldr s8, [x13, #0x0]\n" - "ldr s12, [x9, #0x0]\n" - "ldr s16, [x27, #0x0]\n" - "81:" // Height 3: Partial accumulate: Done - "sub x13, x13, x19\n" - "sub x9, x9, x19\n" - "sub x27, x27, x19\n" - "b 84f\n" - "82:" // Height 3: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "ldr q12, [x9, #0x0]\n" - "ldr q13, [x9, #0x10]\n" - "ldr q14, [x9, #0x20]\n" - "ldr q15, [x9, #0x30]\n" - "ldr q16, [x27, #0x0]\n" - "ldr q17, [x27, #0x10]\n" - "ldr q18, [x27, #0x20]\n" - "ldr q19, [x27, #0x30]\n" - "b 84f\n" - "83:" // Height 
3: no accumulate + "80:" // Height 3: no accumulate "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -885,504 +861,491 @@ void a64_hybrid_s8s32_dot_6x16 ( "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" "movi v19.4s, #0x0\n" - "84:" // Height 3: setup done - "mov x12, #0x0\n" - "85:" // Height 3: String loop + "81:" // Height 3: setup done + "mov x27, #0x0\n" + "82:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 86f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 83f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "cbnz x12, 87f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "cbnz x27, 84f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "add x26, x26, x19\n" - "b 87f\n" - "86:" // Height 3: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "add x26, x28, x19\n" - "87:" // Height 3: input setup done - "cmp x11, #0x10\n" - "blt 90f\n" - "cmp x11, #0x20\n" - "blt 89f\n" - "88:" // Height 3: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "b 84f\n" + "83:" // Height 3: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "84:" // Height 3: input setup done + "cmp x26, #0x10\n" + "blt 87f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x20\n" + "ldr q2, [x23, #0x0]\n" + "ldr q6, [x9, #0x0]\n" + "blt 86f\n" + "85:" // Height 3: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, 
v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" + "add x25, x25, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x9, #0x20]\n" + "add x23, x23, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "add x28, x28, #0x10\n" - "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sub x26, x26, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x23, #0x80]\n" + "cmp x26, #0x20\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "sub x11, x11, #0x10\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - "cmp x11, #0x20\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x14, #0x40]\n" + "ldr q6, [x9, #0x40]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x9, #0x50]\n" ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x9, #0x60]\n" ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x9, #0x70]\n" ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x4fa0e0eb 
// sdot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x9, #0xa0]\n" ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x9, #0xb0]\n" ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x9, #0xc0]\n" ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x9, #0xd0]\n" ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x9, #0xe0]\n" ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" - "add x14, x14, #0x100\n" + "ldr q7, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + "ldr q6, [x9, #0x0]\n" ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + "ldr q0, [x25, #0x0]\n" ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + "ldr q1, [x24, #0x0]\n" ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" - "bge 88b\n" - "89:" // Height 3: Multiply loop: Single 
iteration only - "sub x11, x11, #0x10\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "ldr q2, [x23, #0x0]\n" + "bge 85b\n" + "86:" // Height 3: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" + "sub x26, x26, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "add x25, x25, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "add x24, x24, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "add x28, x28, #0x10\n" - "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x9, #0x20]\n" + "add x23, x23, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "ldr q7, [x14, #0x30]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "ldr q7, [x9, #0x30]\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x14, #0x40]\n" + "ldr q6, [x9, #0x40]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x9, #0x50]\n" ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x9, #0x60]\n" ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x9, #0x70]\n" ".inst 0x4fa0e0ca // sdot 
v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x9, #0xa0]\n" ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x9, #0xb0]\n" ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x9, #0xc0]\n" ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x9, #0xd0]\n" ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x9, #0xe0]\n" ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" - "add x14, x14, #0x100\n" + "ldr q7, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, 
v1.4b[3]\n" ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" - "90:" // Height 3: Multiply loop: Main loop skip - "cbz x11, 95f\n" - "cmp x11, #0x4\n" - "blt 92f\n" - "91:" // Height 3: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x28], #0x4\n" - "ldr s2, [x26], #0x4\n" - "ldr q6, [x14, #0x0]\n" + "87:" // Height 3: Multiply loop: Main loop skip + "cbz x26, 92f\n" + "cmp x26, #0x4\n" + "blt 89f\n" + "88:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x4\n" + "ldr s1, [x24], #0x4\n" + "cmp x26, #0x4\n" + "ldr s2, [x23], #0x4\n" + "ldr q6, [x9, #0x0]\n" ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "sub x11, x11, #0x4\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x14, #0x20]\n" - "cmp x11, #0x4\n" + "ldr q6, [x9, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" + "add x9, x9, #0x40\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - "bge 91b\n" - "cbz x11, 95f\n" - "92:" // Height 3: Multiply loop: Skip odd blocks - "tbz x11, #1, 93f\n" - "ldr h0, [x10], #0x2\n" - "ldr h1, [x28], #0x2\n" - "ldr h2, [x26], #0x2\n" - "tbz x11, #0, 94f\n" - "ld1 { v0.b }[2], [x10]\n" - "ld1 { v1.b }[2], [x28]\n" - "ld1 { v2.b }[2], [x26]\n" - "b 94f\n" - "93:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x10, #0x0]\n" - "ldr b1, [x28, #0x0]\n" - "ldr b2, [x26, #0x0]\n" - "94:" // 
Height 3: Multiply loop: Ragged operand read: Done - "ldr q6, [x14, #0x0]\n" + "bge 88b\n" + "cbz x26, 92f\n" + "89:" // Height 3: Multiply loop: Skip odd blocks + "tbz x26, #1, 90f\n" + "ldr h0, [x25], #0x2\n" + "ldr h1, [x24], #0x2\n" + "ldr h2, [x23], #0x2\n" + "tbz x26, #0, 91f\n" + "ld1 { v0.b }[2], [x25]\n" + "ld1 { v1.b }[2], [x24]\n" + "ld1 { v2.b }[2], [x23]\n" + "b 91f\n" + "90:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x25, #0x0]\n" + "ldr b1, [x24, #0x0]\n" + "ldr b2, [x23, #0x0]\n" + "91:" // Height 3: Multiply loop: Ragged operand read: Done + "ldr q6, [x9, #0x0]\n" ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x9, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" + "add x9, x9, #0x40\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - "95:" // Height 3: Multiply loop: No odd multiplies + "92:" // Height 3: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 85b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "cmp x15, #0x10\n" - "prfm pstl1keep, [x27, #0x0]\n" - "bge 104f\n" - "tbz x15, #3, 99f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v9.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v13.4s }, [x9], #0x10\n" - "st1 { 
v16.4s }, [x27], #0x10\n" - "st1 { v17.4s }, [x27], #0x10\n" - "tbz x15, #2, 97f\n" - "st1 { v10.4s }, [x13], #0x10\n" - "st1 { v14.4s }, [x9], #0x10\n" - "st1 { v18.4s }, [x27], #0x10\n" - "tbz x15, #1, 96f\n" - "str d11, [x13], #0x8\n" - "str d15, [x9], #0x8\n" - "str d19, [x27], #0x8\n" - "tbz x15, #0, 103f\n" - "st1 { v11.s }[2], [x13]\n" - "st1 { v15.s }[2], [x9]\n" - "st1 { v19.s }[2], [x27]\n" - "b 103f\n" - "96:" // Height 3: Partial direct writeback: partial_1_12 - "tbz x15, #0, 103f\n" - "str s11, [x13, #0x0]\n" - "str s15, [x9, #0x0]\n" - "str s19, [x27, #0x0]\n" - "b 103f\n" - "97:" // Height 3: Partial direct writeback: partial_2_8 - "tbz x15, #1, 98f\n" - "str d10, [x13], #0x8\n" - "str d14, [x9], #0x8\n" - "str d18, [x27], #0x8\n" - "tbz x15, #0, 103f\n" - "st1 { v10.s }[2], [x13]\n" - "st1 { v14.s }[2], [x9]\n" - "st1 { v18.s }[2], [x27]\n" - "b 103f\n" - "98:" // Height 3: Partial direct writeback: partial_1_8 - "tbz x15, #0, 103f\n" - "str s10, [x13, #0x0]\n" - "str s14, [x9, #0x0]\n" - "str s18, [x27, #0x0]\n" - "b 103f\n" - "99:" // Height 3: Partial direct writeback: partial_4_0 - "tbz x15, #2, 101f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "tbz x15, #1, 100f\n" - "str d9, [x13], #0x8\n" - "str d13, [x9], #0x8\n" - "str d17, [x27], #0x8\n" - "tbz x15, #0, 103f\n" - "st1 { v9.s }[2], [x13]\n" - "st1 { v13.s }[2], [x9]\n" - "st1 { v17.s }[2], [x27]\n" - "b 103f\n" - "100:" // Height 3: Partial direct writeback: partial_1_4 - "tbz x15, #0, 103f\n" - "str s9, [x13, #0x0]\n" - "str s13, [x9, #0x0]\n" - "str s17, [x27, #0x0]\n" - "b 103f\n" - "101:" // Height 3: Partial direct writeback: partial_2_0 - "tbz x15, #1, 102f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "str d16, [x27], #0x8\n" - "tbz x15, #0, 103f\n" - "st1 { v8.s }[2], [x13]\n" - "st1 { v12.s }[2], [x9]\n" - "st1 { v16.s }[2], [x27]\n" - "b 103f\n" - "102:" // Height 3: Partial direct writeback: partial_1_0 - 
"str s8, [x13, #0x0]\n" - "str s12, [x9, #0x0]\n" - "str s16, [x27, #0x0]\n" - "103:" // Height 3: Partial direct writeback: Done - "b 105f\n" - "104:" // Height 3: Full writeback - "str q8, [x13, #0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "str q12, [x9, #0x0]\n" - "str q13, [x9, #0x10]\n" - "str q14, [x9, #0x20]\n" - "str q15, [x9, #0x30]\n" - "str q16, [x27, #0x0]\n" - "str q17, [x27, #0x10]\n" - "str q18, [x27, #0x20]\n" - "str q19, [x27, #0x30]\n" - "add x13, x13, #0x40\n" - "add x9, x9, #0x40\n" - "add x27, x27, #0x40\n" - "105:" // Height 3: Writeback done - "subs x15, x15, #0x10\n" - "bgt 73b\n" - "b 212f\n" - "106:" // Height 4 - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 82b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 107f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "add x27, x27, x19, LSL #2\n" - "add x25, x25, x19, LSL #2\n" - "b 108f\n" - "107:" // Height 4: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "108:" // Height 4: Column loop - "tbz %x[flags], #0, 118f\n" - "cmp x15, #0x10\n" - "bge 117f\n" - "tbz x15, #3, 112f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "ld1 { v20.4s }, [x25], #0x10\n" - "ld1 { v9.4s }, [x13], #0x10\n" - "ld1 { v13.4s }, [x9], #0x10\n" - "ld1 { v17.4s }, [x27], #0x10\n" - "ld1 { v21.4s }, [x25], #0x10\n" - "tbz x15, #2, 110f\n" - "ld1 { v10.4s }, [x13], #0x10\n" - "ld1 { v14.4s }, [x9], #0x10\n" - "ld1 { v18.4s }, [x27], #0x10\n" - "ld1 { v22.4s }, [x25], #0x10\n" - "tbz x15, #1, 109f\n" - "mov x19, #0x38\n" - 
"ldr d11, [x13], #0x8\n" - "ldr d15, [x9], #0x8\n" - "ldr d19, [x27], #0x8\n" - "ldr d23, [x25], #0x8\n" - "tbz x15, #0, 116f\n" - "ld1 { v11.s }[2], [x13]\n" - "ld1 { v15.s }[2], [x9]\n" - "ld1 { v19.s }[2], [x27]\n" - "ld1 { v23.s }[2], [x25]\n" - "b 116f\n" - "109:" // Height 4: Partial accumulate: partial_1_12 - "mov x19, #0x30\n" - "tbz x15, #0, 116f\n" - "ldr s11, [x13, #0x0]\n" - "ldr s15, [x9, #0x0]\n" - "ldr s19, [x27, #0x0]\n" - "ldr s23, [x25, #0x0]\n" - "b 116f\n" - "110:" // Height 4: Partial accumulate: partial_2_8 - "tbz x15, #1, 111f\n" - "ldr d10, [x13], #0x8\n" - "ldr d14, [x9], #0x8\n" - "ldr d18, [x27], #0x8\n" - "ldr d22, [x25], #0x8\n" - "mov x19, #0x28\n" - "tbz x15, #0, 116f\n" - "ld1 { v10.s }[2], [x13]\n" - "ld1 { v14.s }[2], [x9]\n" - "ld1 { v18.s }[2], [x27]\n" - "ld1 { v22.s }[2], [x25]\n" - "b 116f\n" - "111:" // Height 4: Partial accumulate: partial_1_8 - "mov x19, #0x20\n" - "tbz x15, #0, 116f\n" - "ldr s10, [x13, #0x0]\n" - "ldr s14, [x9, #0x0]\n" - "ldr s18, [x27, #0x0]\n" - "ldr s22, [x25, #0x0]\n" - "b 116f\n" - "112:" // Height 4: Partial accumulate: partial_4_0 - "tbz x15, #2, 114f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "ld1 { v20.4s }, [x25], #0x10\n" - "tbz x15, #1, 113f\n" - "mov x19, #0x18\n" - "ldr d9, [x13], #0x8\n" - "ldr d13, [x9], #0x8\n" - "ldr d17, [x27], #0x8\n" - "ldr d21, [x25], #0x8\n" - "tbz x15, #0, 116f\n" - "ld1 { v9.s }[2], [x13]\n" - "ld1 { v13.s }[2], [x9]\n" - "ld1 { v17.s }[2], [x27]\n" - "ld1 { v21.s }[2], [x25]\n" - "b 116f\n" - "113:" // Height 4: Partial accumulate: partial_1_4 - "mov x19, #0x10\n" - "tbz x15, #0, 116f\n" - "ldr s9, [x13, #0x0]\n" - "ldr s13, [x9, #0x0]\n" - "ldr s17, [x27, #0x0]\n" - "ldr s21, [x25, #0x0]\n" - "b 116f\n" - "114:" // Height 4: Partial accumulate: partial_2_0 - "tbz x15, #1, 115f\n" - "ldr d8, [x13], #0x8\n" - "ldr d12, [x9], #0x8\n" - "ldr d16, [x27], #0x8\n" - "ldr d20, [x25], #0x8\n" - "mov x19, 
#0x8\n" - "tbz x15, #0, 116f\n" - "ld1 { v8.s }[2], [x13]\n" - "ld1 { v12.s }[2], [x9]\n" - "ld1 { v16.s }[2], [x27]\n" - "ld1 { v20.s }[2], [x25]\n" - "b 116f\n" - "115:" // Height 4: Partial accumulate: partial_1_0 - "mov x19, #0x0\n" - "ldr s8, [x13, #0x0]\n" - "ldr s12, [x9, #0x0]\n" - "ldr s16, [x27, #0x0]\n" - "ldr s20, [x25, #0x0]\n" - "116:" // Height 4: Partial accumulate: Done - "sub x13, x13, x19\n" - "sub x9, x9, x19\n" - "sub x27, x27, x19\n" - "sub x25, x25, x19\n" - "b 119f\n" - "117:" // Height 4: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "ldr q12, [x9, #0x0]\n" - "ldr q13, [x9, #0x10]\n" - "ldr q14, [x9, #0x20]\n" - "ldr q15, [x9, #0x30]\n" - "ldr q16, [x27, #0x0]\n" - "ldr q17, [x27, #0x10]\n" - "ldr q18, [x27, #0x20]\n" - "ldr q19, [x27, #0x30]\n" - "ldr q20, [x25, #0x0]\n" - "ldr q21, [x25, #0x10]\n" - "ldr q22, [x25, #0x20]\n" - "ldr q23, [x25, #0x30]\n" - "b 119f\n" - "118:" // Height 4: no accumulate + "prfm pstl1keep, [x28, #0x0]\n" + "cmp x10, #0x10\n" + "add x23, x28, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "bge 101f\n" + "tbz x10, #3, 96f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v9.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v13.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v17.4s }, [x22], #0x10\n" + "tbz x10, #2, 94f\n" + "st1 { v10.4s }, [x28], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "st1 { v18.4s }, [x22], #0x10\n" + "tbz x10, #1, 93f\n" + "str d11, [x28], #0x8\n" + "str d15, [x23], #0x8\n" + "str d19, [x22], #0x8\n" + "tbz x10, #0, 100f\n" + "st1 { v11.s }[2], [x28]\n" + "st1 { v15.s }[2], [x23]\n" + "st1 { v19.s }[2], [x22]\n" + "b 100f\n" + "93:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x10, #0, 100f\n" + "str s11, [x28, #0x0]\n" + "str s15, [x23, #0x0]\n" + "str s19, [x22, #0x0]\n" + "b 100f\n" + "94:" 
// Height 3: Partial direct writeback: partial_2_8 + "tbz x10, #1, 95f\n" + "str d10, [x28], #0x8\n" + "str d14, [x23], #0x8\n" + "str d18, [x22], #0x8\n" + "tbz x10, #0, 100f\n" + "st1 { v10.s }[2], [x28]\n" + "st1 { v14.s }[2], [x23]\n" + "st1 { v18.s }[2], [x22]\n" + "b 100f\n" + "95:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x10, #0, 100f\n" + "str s10, [x28, #0x0]\n" + "str s14, [x23, #0x0]\n" + "str s18, [x22, #0x0]\n" + "b 100f\n" + "96:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x10, #2, 98f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "tbz x10, #1, 97f\n" + "str d9, [x28], #0x8\n" + "str d13, [x23], #0x8\n" + "str d17, [x22], #0x8\n" + "tbz x10, #0, 100f\n" + "st1 { v9.s }[2], [x28]\n" + "st1 { v13.s }[2], [x23]\n" + "st1 { v17.s }[2], [x22]\n" + "b 100f\n" + "97:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x10, #0, 100f\n" + "str s9, [x28, #0x0]\n" + "str s13, [x23, #0x0]\n" + "str s17, [x22, #0x0]\n" + "b 100f\n" + "98:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x10, #1, 99f\n" + "str d8, [x28], #0x8\n" + "str d12, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "tbz x10, #0, 100f\n" + "st1 { v8.s }[2], [x28]\n" + "st1 { v12.s }[2], [x23]\n" + "st1 { v16.s }[2], [x22]\n" + "b 100f\n" + "99:" // Height 3: Partial direct writeback: partial_1_0 + "str s8, [x28, #0x0]\n" + "str s12, [x23, #0x0]\n" + "str s16, [x22, #0x0]\n" + "100:" // Height 3: Partial direct writeback: Done + "b 102f\n" + "101:" // Height 3: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q12, [x23, #0x0]\n" + "str q13, [x23, #0x10]\n" + "str q14, [x23, #0x20]\n" + "str q15, [x23, #0x30]\n" + "str q16, [x22, #0x0]\n" + "str q17, [x22, #0x10]\n" + "str q18, [x22, #0x20]\n" + "str q19, [x22, #0x30]\n" + "102:" // Height 3: Writeback done + "subs x10, x10, 
#0x10\n" + "bgt 70b\n" + "b 206f\n" + "103:" // Height 4 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "104:" // Height 4: Column loop + "tbz %x[flags], #0, 114f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x10, #0x10\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "bge 113f\n" + "tbz x10, #3, 108f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { v21.4s }, [x21], #0x10\n" + "tbz x10, #2, 106f\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { v18.4s }, [x22], #0x10\n" + "ld1 { v22.4s }, [x21], #0x10\n" + "tbz x10, #1, 105f\n" + "mov x24, #0x38\n" + "ldr d11, [x28], #0x8\n" + "ldr d15, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" + "tbz x10, #0, 112f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x23]\n" + "ld1 { v19.s }[2], [x22]\n" + "ld1 { v23.s }[2], [x21]\n" + "b 112f\n" + "105:" // Height 4: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x10, #0, 112f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "ldr s23, [x21, #0x0]\n" + "b 112f\n" + "106:" // Height 4: Partial accumulate: partial_2_8 + "tbz x10, #1, 107f\n" + "ldr d10, [x28], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "ldr d18, [x22], #0x8\n" + "ldr d22, [x21], #0x8\n" + "tbz x10, #0, 112f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x23]\n" + "ld1 { v18.s }[2], [x22]\n" + "ld1 { v22.s }[2], [x21]\n" + "b 112f\n" + "107:" // Height 4: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x10, #0, 112f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "ldr s22, [x21, 
#0x0]\n" + "b 112f\n" + "108:" // Height 4: Partial accumulate: partial_4_0 + "tbz x10, #2, 110f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "tbz x10, #1, 109f\n" + "mov x24, #0x18\n" + "ldr d9, [x28], #0x8\n" + "ldr d13, [x23], #0x8\n" + "ldr d17, [x22], #0x8\n" + "ldr d21, [x21], #0x8\n" + "tbz x10, #0, 112f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x23]\n" + "ld1 { v17.s }[2], [x22]\n" + "ld1 { v21.s }[2], [x21]\n" + "b 112f\n" + "109:" // Height 4: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x10, #0, 112f\n" + "ldr s9, [x28, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "ldr s21, [x21, #0x0]\n" + "b 112f\n" + "110:" // Height 4: Partial accumulate: partial_2_0 + "tbz x10, #1, 111f\n" + "ldr d8, [x28], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "ldr d16, [x22], #0x8\n" + "ldr d20, [x21], #0x8\n" + "tbz x10, #0, 112f\n" + "ld1 { v8.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x23]\n" + "ld1 { v16.s }[2], [x22]\n" + "ld1 { v20.s }[2], [x21]\n" + "b 112f\n" + "111:" // Height 4: Partial accumulate: partial_1_0 + "ldr s8, [x28, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "ldr s16, [x22, #0x0]\n" + "ldr s20, [x21, #0x0]\n" + "112:" // Height 4: Partial accumulate: Done + "sub x28, x28, x24\n" + "b 115f\n" + "113:" // Height 4: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + "ldr q16, [x22, #0x0]\n" + "ldr q17, [x22, #0x10]\n" + "ldr q18, [x22, #0x20]\n" + "ldr q19, [x22, #0x30]\n" + "ldr q20, [x21, #0x0]\n" + "ldr q21, [x21, #0x10]\n" + "ldr q22, [x21, #0x20]\n" + "ldr q23, [x21, #0x30]\n" + "b 115f\n" + "114:" // Height 4: no accumulate "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -1399,220 
+1362,220 @@ void a64_hybrid_s8s32_dot_6x16 ( "movi v21.4s, #0x0\n" "movi v22.4s, #0x0\n" "movi v23.4s, #0x0\n" - "119:" // Height 4: setup done - "mov x12, #0x0\n" - "120:" // Height 4: String loop + "115:" // Height 4: setup done + "mov x27, #0x0\n" + "116:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 121f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 117f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "cbnz x12, 122f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "cbnz x27, 118f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "add x26, x26, x19\n" + "add x25, x25, x19\n" "add x24, x24, x19\n" - "b 122f\n" - "121:" // Height 4: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "add x26, x28, x19\n" - "add x24, x26, x19\n" - "122:" // Height 4: input setup done - "cmp x11, #0x10\n" - "blt 125f\n" - "cmp x11, #0x20\n" - "blt 124f\n" - "123:" // Height 4: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "b 118f\n" + "117:" // Height 4: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "118:" // Height 4: input setup done + "cmp x26, #0x10\n" + "blt 121f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x20\n" + "ldr q2, [x23, #0x0]\n" + "ldr q3, [x22, #0x0]\n" + "ldr q6, [x9, #0x0]\n" + "blt 120f\n" + "119:" // Height 
4: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" + "add x25, x25, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x23, x23, #0x10\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x9, #0x20]\n" + "add x22, x22, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "add x26, x26, #0x10\n" - "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "sub x26, x26, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "add x24, x24, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "cmp x26, #0x20\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "sub x11, x11, #0x10\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "cmp x11, #0x20\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x14, #0x40]\n" + "ldr q6, [x9, #0x40]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x9, #0x50]\n" ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x9, 
#0x60]\n" ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x9, #0x70]\n" ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x9, #0xa0]\n" ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x9, #0xb0]\n" ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x9, #0xc0]\n" ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x9, #0xd0]\n" ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" 
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x9, #0xe0]\n" ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" - "add x14, x14, #0x100\n" + "ldr q7, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + "ldr q6, [x9, #0x0]\n" ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + "ldr q0, [x25, #0x0]\n" ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + "ldr q1, [x24, #0x0]\n" ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + "ldr q2, [x23, #0x0]\n" ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" - "bge 123b\n" - "124:" // Height 4: Multiply loop: Single iteration only - "sub x11, x11, #0x10\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "ldr q3, [x22, #0x0]\n" + "bge 119b\n" + "120:" // Height 4: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" + "sub x26, x26, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "add x25, x25, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "add x28, x28, #0x10\n" + "add x24, x24, #0x10\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "ldr q6, [x14, #0x20]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x23, x23, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "add 
x26, x26, #0x10\n" - "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x9, #0x20]\n" + "add x22, x22, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "add x24, x24, #0x10\n" + "prfm pldl1keep, [x23, #0x80]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x14, #0x40]\n" + "ldr q6, [x9, #0x40]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x9, #0x50]\n" ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x9, #0x60]\n" ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x9, #0x70]\n" ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, 
v3.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x9, #0xa0]\n" ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x9, #0xb0]\n" ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x9, #0xc0]\n" ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x9, #0xd0]\n" ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x9, #0xe0]\n" ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" - "add x14, x14, #0x100\n" + "ldr q7, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" @@ -1621,31 +1584,31 @@ void a64_hybrid_s8s32_dot_6x16 ( ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" 
".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" - "125:" // Height 4: Multiply loop: Main loop skip - "cbz x11, 130f\n" - "cmp x11, #0x4\n" - "blt 127f\n" - "126:" // Height 4: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x28], #0x4\n" - "ldr s2, [x26], #0x4\n" - "ldr s3, [x24], #0x4\n" - "ldr q6, [x14, #0x0]\n" + "121:" // Height 4: Multiply loop: Main loop skip + "cbz x26, 126f\n" + "cmp x26, #0x4\n" + "blt 123f\n" + "122:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x4\n" + "ldr s1, [x24], #0x4\n" + "cmp x26, #0x4\n" + "ldr s2, [x23], #0x4\n" + "ldr s3, [x22], #0x4\n" + "ldr q6, [x9, #0x0]\n" ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "sub x11, x11, #0x4\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "cmp x11, #0x4\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x9, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" + "add x9, x9, #0x40\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" @@ -1653,40 +1616,40 @@ void a64_hybrid_s8s32_dot_6x16 ( ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - "bge 126b\n" - "cbz x11, 130f\n" - "127:" // Height 4: Multiply loop: Skip odd blocks - "tbz x11, #1, 128f\n" - "ldr h0, [x10], #0x2\n" - "ldr h1, 
[x28], #0x2\n" - "ldr h2, [x26], #0x2\n" - "ldr h3, [x24], #0x2\n" - "tbz x11, #0, 129f\n" - "ld1 { v0.b }[2], [x10]\n" - "ld1 { v1.b }[2], [x28]\n" - "ld1 { v2.b }[2], [x26]\n" - "ld1 { v3.b }[2], [x24]\n" - "b 129f\n" - "128:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x10, #0x0]\n" - "ldr b1, [x28, #0x0]\n" - "ldr b2, [x26, #0x0]\n" - "ldr b3, [x24, #0x0]\n" - "129:" // Height 4: Multiply loop: Ragged operand read: Done - "ldr q6, [x14, #0x0]\n" + "bge 122b\n" + "cbz x26, 126f\n" + "123:" // Height 4: Multiply loop: Skip odd blocks + "tbz x26, #1, 124f\n" + "ldr h0, [x25], #0x2\n" + "ldr h1, [x24], #0x2\n" + "ldr h2, [x23], #0x2\n" + "ldr h3, [x22], #0x2\n" + "tbz x26, #0, 125f\n" + "ld1 { v0.b }[2], [x25]\n" + "ld1 { v1.b }[2], [x24]\n" + "ld1 { v2.b }[2], [x23]\n" + "ld1 { v3.b }[2], [x22]\n" + "b 125f\n" + "124:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x25, #0x0]\n" + "ldr b1, [x24, #0x0]\n" + "ldr b2, [x23, #0x0]\n" + "ldr b3, [x22, #0x0]\n" + "125:" // Height 4: Multiply loop: Ragged operand read: Done + "ldr q6, [x9, #0x0]\n" ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x9, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" + "add x9, x9, #0x40\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" @@ -1694,308 +1657,292 @@ void 
a64_hybrid_s8s32_dot_6x16 ( ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - "130:" // Height 4: Multiply loop: No odd multiplies + "126:" // Height 4: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 120b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "cmp x15, #0x10\n" - "prfm pstl1keep, [x27, #0x0]\n" - "prfm pstl1keep, [x25, #0x0]\n" - "bge 139f\n" - "tbz x15, #3, 134f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v9.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v13.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "st1 { v17.4s }, [x27], #0x10\n" - "st1 { v20.4s }, [x25], #0x10\n" - "st1 { v21.4s }, [x25], #0x10\n" - "tbz x15, #2, 132f\n" - "st1 { v10.4s }, [x13], #0x10\n" - "st1 { v14.4s }, [x9], #0x10\n" - "st1 { v18.4s }, [x27], #0x10\n" - "st1 { v22.4s }, [x25], #0x10\n" - "tbz x15, #1, 131f\n" - "str d11, [x13], #0x8\n" - "str d15, [x9], #0x8\n" - "str d19, [x27], #0x8\n" - "str d23, [x25], #0x8\n" - "tbz x15, #0, 138f\n" - "st1 { v11.s }[2], [x13]\n" - "st1 { v15.s }[2], [x9]\n" - "st1 { v19.s }[2], [x27]\n" - "st1 { v23.s }[2], [x25]\n" - "b 138f\n" - "131:" // Height 4: Partial direct writeback: partial_1_12 - "tbz x15, #0, 138f\n" - "str s11, [x13, #0x0]\n" - "str s15, [x9, #0x0]\n" - "str s19, [x27, #0x0]\n" - "str s23, [x25, #0x0]\n" - "b 138f\n" - "132:" // Height 4: Partial direct writeback: partial_2_8 - "tbz x15, #1, 133f\n" - "str d10, [x13], #0x8\n" - "str d14, [x9], #0x8\n" - "str d18, [x27], #0x8\n" - "str d22, [x25], #0x8\n" - "tbz x15, #0, 138f\n" - "st1 { v10.s }[2], [x13]\n" - "st1 { v14.s }[2], [x9]\n" - "st1 { v18.s }[2], [x27]\n" - "st1 { v22.s }[2], [x25]\n" - "b 138f\n" - "133:" // Height 4: Partial direct writeback: partial_1_8 - "tbz x15, #0, 138f\n" - "str s10, [x13, #0x0]\n" - 
"str s14, [x9, #0x0]\n" - "str s18, [x27, #0x0]\n" - "str s22, [x25, #0x0]\n" - "b 138f\n" - "134:" // Height 4: Partial direct writeback: partial_4_0 - "tbz x15, #2, 136f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "st1 { v20.4s }, [x25], #0x10\n" - "tbz x15, #1, 135f\n" - "str d9, [x13], #0x8\n" - "str d13, [x9], #0x8\n" - "str d17, [x27], #0x8\n" - "str d21, [x25], #0x8\n" - "tbz x15, #0, 138f\n" - "st1 { v9.s }[2], [x13]\n" - "st1 { v13.s }[2], [x9]\n" - "st1 { v17.s }[2], [x27]\n" - "st1 { v21.s }[2], [x25]\n" - "b 138f\n" - "135:" // Height 4: Partial direct writeback: partial_1_4 - "tbz x15, #0, 138f\n" - "str s9, [x13, #0x0]\n" - "str s13, [x9, #0x0]\n" - "str s17, [x27, #0x0]\n" - "str s21, [x25, #0x0]\n" - "b 138f\n" - "136:" // Height 4: Partial direct writeback: partial_2_0 - "tbz x15, #1, 137f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "str d16, [x27], #0x8\n" - "str d20, [x25], #0x8\n" - "tbz x15, #0, 138f\n" - "st1 { v8.s }[2], [x13]\n" - "st1 { v12.s }[2], [x9]\n" - "st1 { v16.s }[2], [x27]\n" - "st1 { v20.s }[2], [x25]\n" - "b 138f\n" - "137:" // Height 4: Partial direct writeback: partial_1_0 - "str s8, [x13, #0x0]\n" - "str s12, [x9, #0x0]\n" - "str s16, [x27, #0x0]\n" - "str s20, [x25, #0x0]\n" - "138:" // Height 4: Partial direct writeback: Done - "b 140f\n" - "139:" // Height 4: Full writeback - "str q8, [x13, #0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "str q12, [x9, #0x0]\n" - "str q13, [x9, #0x10]\n" - "str q14, [x9, #0x20]\n" - "str q15, [x9, #0x30]\n" - "str q16, [x27, #0x0]\n" - "str q17, [x27, #0x10]\n" - "str q18, [x27, #0x20]\n" - "str q19, [x27, #0x30]\n" - "str q20, [x25, #0x0]\n" - "str q21, [x25, #0x10]\n" - "str q22, [x25, #0x20]\n" - "str q23, [x25, #0x30]\n" - "add x13, x13, #0x40\n" - "add x9, x9, #0x40\n" - "add x27, x27, #0x40\n" - "add x25, x25, #0x40\n" - "140:" // Height 4: Writeback done - "subs 
x15, x15, #0x10\n" - "bgt 108b\n" - "b 212f\n" - "141:" // Height 5 - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 116b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 142f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "ldr x23, [%x[output_ptr], #0x20]\n" - "add x27, x27, x19, LSL #2\n" - "add x25, x25, x19, LSL #2\n" - "add x23, x23, x19, LSL #2\n" - "b 143f\n" - "142:" // Height 5: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "add x23, x25, x19, LSL #2\n" - "143:" // Height 5: Column loop - "tbz %x[flags], #0, 153f\n" - "cmp x15, #0x10\n" - "bge 152f\n" - "tbz x15, #3, 147f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "ld1 { v20.4s }, [x25], #0x10\n" - "ld1 { v24.4s }, [x23], #0x10\n" - "ld1 { v9.4s }, [x13], #0x10\n" - "ld1 { v13.4s }, [x9], #0x10\n" - "ld1 { v17.4s }, [x27], #0x10\n" - "ld1 { v21.4s }, [x25], #0x10\n" - "ld1 { v25.4s }, [x23], #0x10\n" - "tbz x15, #2, 145f\n" - "ld1 { v10.4s }, [x13], #0x10\n" - "ld1 { v14.4s }, [x9], #0x10\n" - "ld1 { v18.4s }, [x27], #0x10\n" - "ld1 { v22.4s }, [x25], #0x10\n" - "ld1 { v26.4s }, [x23], #0x10\n" - "tbz x15, #1, 144f\n" - "mov x19, #0x38\n" - "ldr d11, [x13], #0x8\n" - "ldr d15, [x9], #0x8\n" - "ldr d19, [x27], #0x8\n" - "ldr d23, [x25], #0x8\n" - "ldr d27, [x23], #0x8\n" - "tbz x15, #0, 151f\n" - "ld1 { v11.s }[2], [x13]\n" - "ld1 { v15.s }[2], [x9]\n" - "ld1 { v19.s }[2], [x27]\n" - "ld1 { v23.s }[2], [x25]\n" - "ld1 { v27.s }[2], [x23]\n" - "b 151f\n" - "144:" // Height 5: Partial accumulate: partial_1_12 - "mov x19, #0x30\n" - "tbz x15, #0, 151f\n" 
- "ldr s11, [x13, #0x0]\n" - "ldr s15, [x9, #0x0]\n" - "ldr s19, [x27, #0x0]\n" - "ldr s23, [x25, #0x0]\n" - "ldr s27, [x23, #0x0]\n" - "b 151f\n" - "145:" // Height 5: Partial accumulate: partial_2_8 - "tbz x15, #1, 146f\n" - "ldr d10, [x13], #0x8\n" - "ldr d14, [x9], #0x8\n" - "ldr d18, [x27], #0x8\n" - "ldr d22, [x25], #0x8\n" - "ldr d26, [x23], #0x8\n" - "mov x19, #0x28\n" - "tbz x15, #0, 151f\n" - "ld1 { v10.s }[2], [x13]\n" - "ld1 { v14.s }[2], [x9]\n" - "ld1 { v18.s }[2], [x27]\n" - "ld1 { v22.s }[2], [x25]\n" - "ld1 { v26.s }[2], [x23]\n" - "b 151f\n" - "146:" // Height 5: Partial accumulate: partial_1_8 - "mov x19, #0x20\n" - "tbz x15, #0, 151f\n" - "ldr s10, [x13, #0x0]\n" - "ldr s14, [x9, #0x0]\n" - "ldr s18, [x27, #0x0]\n" - "ldr s22, [x25, #0x0]\n" - "ldr s26, [x23, #0x0]\n" - "b 151f\n" - "147:" // Height 5: Partial accumulate: partial_4_0 - "tbz x15, #2, 149f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "ld1 { v20.4s }, [x25], #0x10\n" - "ld1 { v24.4s }, [x23], #0x10\n" - "tbz x15, #1, 148f\n" - "mov x19, #0x18\n" - "ldr d9, [x13], #0x8\n" - "ldr d13, [x9], #0x8\n" - "ldr d17, [x27], #0x8\n" - "ldr d21, [x25], #0x8\n" - "ldr d25, [x23], #0x8\n" - "tbz x15, #0, 151f\n" - "ld1 { v9.s }[2], [x13]\n" - "ld1 { v13.s }[2], [x9]\n" - "ld1 { v17.s }[2], [x27]\n" - "ld1 { v21.s }[2], [x25]\n" - "ld1 { v25.s }[2], [x23]\n" - "b 151f\n" - "148:" // Height 5: Partial accumulate: partial_1_4 - "mov x19, #0x10\n" - "tbz x15, #0, 151f\n" - "ldr s9, [x13, #0x0]\n" - "ldr s13, [x9, #0x0]\n" - "ldr s17, [x27, #0x0]\n" - "ldr s21, [x25, #0x0]\n" - "ldr s25, [x23, #0x0]\n" - "b 151f\n" - "149:" // Height 5: Partial accumulate: partial_2_0 - "tbz x15, #1, 150f\n" - "ldr d8, [x13], #0x8\n" - "ldr d12, [x9], #0x8\n" - "ldr d16, [x27], #0x8\n" - "ldr d20, [x25], #0x8\n" - "ldr d24, [x23], #0x8\n" - "mov x19, #0x8\n" - "tbz x15, #0, 151f\n" - "ld1 { v8.s }[2], [x13]\n" - "ld1 { v12.s }[2], [x9]\n" - "ld1 { v16.s 
}[2], [x27]\n" - "ld1 { v20.s }[2], [x25]\n" - "ld1 { v24.s }[2], [x23]\n" - "b 151f\n" - "150:" // Height 5: Partial accumulate: partial_1_0 - "mov x19, #0x0\n" - "ldr s8, [x13, #0x0]\n" - "ldr s12, [x9, #0x0]\n" - "ldr s16, [x27, #0x0]\n" - "ldr s20, [x25, #0x0]\n" - "ldr s24, [x23, #0x0]\n" - "151:" // Height 5: Partial accumulate: Done - "sub x13, x13, x19\n" - "sub x9, x9, x19\n" - "sub x27, x27, x19\n" - "sub x25, x25, x19\n" - "sub x23, x23, x19\n" - "b 154f\n" - "152:" // Height 5: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "ldr q12, [x9, #0x0]\n" - "ldr q13, [x9, #0x10]\n" - "ldr q14, [x9, #0x20]\n" - "ldr q15, [x9, #0x30]\n" - "ldr q16, [x27, #0x0]\n" - "ldr q17, [x27, #0x10]\n" - "ldr q18, [x27, #0x20]\n" - "ldr q19, [x27, #0x30]\n" - "ldr q20, [x25, #0x0]\n" - "ldr q21, [x25, #0x10]\n" - "ldr q22, [x25, #0x20]\n" - "ldr q23, [x25, #0x30]\n" - "ldr q24, [x23, #0x0]\n" - "ldr q25, [x23, #0x10]\n" - "ldr q26, [x23, #0x20]\n" - "ldr q27, [x23, #0x30]\n" - "b 154f\n" - "153:" // Height 5: no accumulate + "prfm pstl1keep, [x28, #0x0]\n" + "cmp x10, #0x10\n" + "add x23, x28, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" + "prfm pstl1keep, [x21, #0x0]\n" + "bge 135f\n" + "tbz x10, #3, 130f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v9.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v13.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v17.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "st1 { v21.4s }, [x21], #0x10\n" + "tbz x10, #2, 128f\n" + "st1 { v10.4s }, [x28], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "st1 { v18.4s }, [x22], #0x10\n" + "st1 { v22.4s }, [x21], #0x10\n" + "tbz x10, #1, 127f\n" + "str d11, [x28], #0x8\n" + "str d15, [x23], #0x8\n" + "str d19, [x22], #0x8\n" + "str d23, [x21], #0x8\n" + "tbz x10, #0, 134f\n" 
+ "st1 { v11.s }[2], [x28]\n" + "st1 { v15.s }[2], [x23]\n" + "st1 { v19.s }[2], [x22]\n" + "st1 { v23.s }[2], [x21]\n" + "b 134f\n" + "127:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x10, #0, 134f\n" + "str s11, [x28, #0x0]\n" + "str s15, [x23, #0x0]\n" + "str s19, [x22, #0x0]\n" + "str s23, [x21, #0x0]\n" + "b 134f\n" + "128:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x10, #1, 129f\n" + "str d10, [x28], #0x8\n" + "str d14, [x23], #0x8\n" + "str d18, [x22], #0x8\n" + "str d22, [x21], #0x8\n" + "tbz x10, #0, 134f\n" + "st1 { v10.s }[2], [x28]\n" + "st1 { v14.s }[2], [x23]\n" + "st1 { v18.s }[2], [x22]\n" + "st1 { v22.s }[2], [x21]\n" + "b 134f\n" + "129:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x10, #0, 134f\n" + "str s10, [x28, #0x0]\n" + "str s14, [x23, #0x0]\n" + "str s18, [x22, #0x0]\n" + "str s22, [x21, #0x0]\n" + "b 134f\n" + "130:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x10, #2, 132f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "tbz x10, #1, 131f\n" + "str d9, [x28], #0x8\n" + "str d13, [x23], #0x8\n" + "str d17, [x22], #0x8\n" + "str d21, [x21], #0x8\n" + "tbz x10, #0, 134f\n" + "st1 { v9.s }[2], [x28]\n" + "st1 { v13.s }[2], [x23]\n" + "st1 { v17.s }[2], [x22]\n" + "st1 { v21.s }[2], [x21]\n" + "b 134f\n" + "131:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x10, #0, 134f\n" + "str s9, [x28, #0x0]\n" + "str s13, [x23, #0x0]\n" + "str s17, [x22, #0x0]\n" + "str s21, [x21, #0x0]\n" + "b 134f\n" + "132:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x10, #1, 133f\n" + "str d8, [x28], #0x8\n" + "str d12, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "str d20, [x21], #0x8\n" + "tbz x10, #0, 134f\n" + "st1 { v8.s }[2], [x28]\n" + "st1 { v12.s }[2], [x23]\n" + "st1 { v16.s }[2], [x22]\n" + "st1 { v20.s }[2], [x21]\n" + "b 134f\n" + "133:" // Height 4: Partial direct 
writeback: partial_1_0 + "str s8, [x28, #0x0]\n" + "str s12, [x23, #0x0]\n" + "str s16, [x22, #0x0]\n" + "str s20, [x21, #0x0]\n" + "134:" // Height 4: Partial direct writeback: Done + "b 136f\n" + "135:" // Height 4: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q12, [x23, #0x0]\n" + "str q13, [x23, #0x10]\n" + "str q14, [x23, #0x20]\n" + "str q15, [x23, #0x30]\n" + "str q16, [x22, #0x0]\n" + "str q17, [x22, #0x10]\n" + "str q18, [x22, #0x20]\n" + "str q19, [x22, #0x30]\n" + "str q20, [x21, #0x0]\n" + "str q21, [x21, #0x10]\n" + "str q22, [x21, #0x20]\n" + "str q23, [x21, #0x30]\n" + "136:" // Height 4: Writeback done + "subs x10, x10, #0x10\n" + "bgt 104b\n" + "b 206f\n" + "137:" // Height 5 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "138:" // Height 5: Column loop + "tbz %x[flags], #0, 148f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x10, #0x10\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "bge 147f\n" + "tbz x10, #3, 142f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v24.4s }, [x20], #0x10\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { v21.4s }, [x21], #0x10\n" + "ld1 { v25.4s }, [x20], #0x10\n" + "tbz x10, #2, 140f\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { v18.4s }, [x22], #0x10\n" + "ld1 { v22.4s }, [x21], #0x10\n" + "ld1 { v26.4s }, [x20], #0x10\n" + "tbz x10, #1, 139f\n" + "mov x24, #0x38\n" + "ldr d11, [x28], #0x8\n" + "ldr d15, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" + "ldr d27, [x20], #0x8\n" + "tbz x10, #0, 146f\n" + 
"ld1 { v11.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x23]\n" + "ld1 { v19.s }[2], [x22]\n" + "ld1 { v23.s }[2], [x21]\n" + "ld1 { v27.s }[2], [x20]\n" + "b 146f\n" + "139:" // Height 5: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x10, #0, 146f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "ldr s23, [x21, #0x0]\n" + "ldr s27, [x20, #0x0]\n" + "b 146f\n" + "140:" // Height 5: Partial accumulate: partial_2_8 + "tbz x10, #1, 141f\n" + "ldr d10, [x28], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "ldr d18, [x22], #0x8\n" + "ldr d22, [x21], #0x8\n" + "ldr d26, [x20], #0x8\n" + "tbz x10, #0, 146f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x23]\n" + "ld1 { v18.s }[2], [x22]\n" + "ld1 { v22.s }[2], [x21]\n" + "ld1 { v26.s }[2], [x20]\n" + "b 146f\n" + "141:" // Height 5: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x10, #0, 146f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "ldr s22, [x21, #0x0]\n" + "ldr s26, [x20, #0x0]\n" + "b 146f\n" + "142:" // Height 5: Partial accumulate: partial_4_0 + "tbz x10, #2, 144f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v24.4s }, [x20], #0x10\n" + "tbz x10, #1, 143f\n" + "mov x24, #0x18\n" + "ldr d9, [x28], #0x8\n" + "ldr d13, [x23], #0x8\n" + "ldr d17, [x22], #0x8\n" + "ldr d21, [x21], #0x8\n" + "ldr d25, [x20], #0x8\n" + "tbz x10, #0, 146f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x23]\n" + "ld1 { v17.s }[2], [x22]\n" + "ld1 { v21.s }[2], [x21]\n" + "ld1 { v25.s }[2], [x20]\n" + "b 146f\n" + "143:" // Height 5: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x10, #0, 146f\n" + "ldr s9, [x28, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "ldr s21, [x21, #0x0]\n" + "ldr s25, [x20, #0x0]\n" + "b 146f\n" + "144:" // Height 5: Partial accumulate: partial_2_0 + 
"tbz x10, #1, 145f\n" + "ldr d8, [x28], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "ldr d16, [x22], #0x8\n" + "ldr d20, [x21], #0x8\n" + "ldr d24, [x20], #0x8\n" + "tbz x10, #0, 146f\n" + "ld1 { v8.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x23]\n" + "ld1 { v16.s }[2], [x22]\n" + "ld1 { v20.s }[2], [x21]\n" + "ld1 { v24.s }[2], [x20]\n" + "b 146f\n" + "145:" // Height 5: Partial accumulate: partial_1_0 + "ldr s8, [x28, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "ldr s16, [x22, #0x0]\n" + "ldr s20, [x21, #0x0]\n" + "ldr s24, [x20, #0x0]\n" + "146:" // Height 5: Partial accumulate: Done + "sub x28, x28, x24\n" + "b 149f\n" + "147:" // Height 5: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + "ldr q16, [x22, #0x0]\n" + "ldr q17, [x22, #0x10]\n" + "ldr q18, [x22, #0x20]\n" + "ldr q19, [x22, #0x30]\n" + "ldr q20, [x21, #0x0]\n" + "ldr q21, [x21, #0x10]\n" + "ldr q22, [x21, #0x20]\n" + "ldr q23, [x21, #0x30]\n" + "ldr q24, [x20, #0x0]\n" + "ldr q25, [x20, #0x10]\n" + "ldr q26, [x20, #0x20]\n" + "ldr q27, [x20, #0x30]\n" + "b 149f\n" + "148:" // Height 5: no accumulate "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -2016,260 +1963,260 @@ void a64_hybrid_s8s32_dot_6x16 ( "movi v25.4s, #0x0\n" "movi v26.4s, #0x0\n" "movi v27.4s, #0x0\n" - "154:" // Height 5: setup done - "mov x12, #0x0\n" - "155:" // Height 5: String loop + "149:" // Height 5: setup done + "mov x27, #0x0\n" + "150:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 156f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 151f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add 
x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "ldr x22, [x20, #0x20]\n" - "cbnz x12, 157f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "cbnz x27, 152f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "add x26, x26, x19\n" + "add x25, x25, x19\n" "add x24, x24, x19\n" + "add x23, x23, x19\n" "add x22, x22, x19\n" - "b 157f\n" - "156:" // Height 5: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "add x26, x28, x19\n" - "add x24, x26, x19\n" - "add x22, x24, x19\n" - "157:" // Height 5: input setup done - "cmp x11, #0x10\n" - "blt 160f\n" - "cmp x11, #0x20\n" - "blt 159f\n" - "158:" // Height 5: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q4, [x22, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "add x21, x21, x19\n" + "b 152f\n" + "151:" // Height 5: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "152:" // Height 5: input setup done + "cmp x26, #0x10\n" + "blt 155f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x20\n" + "ldr q2, [x23, #0x0]\n" + "ldr q3, [x22, #0x0]\n" + "ldr q4, [x21, #0x0]\n" + "ldr q6, [x9, #0x0]\n" + "blt 154f\n" + "153:" // Height 5: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" + "add x25, x25, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x24, 
#0x80]\n" + "add x23, x23, #0x10\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x23, #0x80]\n" + "add x22, x22, #0x10\n" ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x9, #0x20]\n" + "add x21, x21, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "add x24, x24, #0x10\n" - "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sub x26, x26, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "add x22, x22, #0x10\n" + "prfm pldl1keep, [x21, #0x80]\n" + "cmp x26, #0x20\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "sub x11, x11, #0x10\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "cmp x11, #0x20\n" ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x14, #0x40]\n" + "ldr q6, [x9, #0x40]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x9, #0x50]\n" ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x9, #0x60]\n" ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, 
v0.4b[1]\n" ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x9, #0x70]\n" ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x9, #0xa0]\n" ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x9, #0xb0]\n" ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x9, #0xc0]\n" ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, 
v1.4b[2]\n" ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x9, #0xd0]\n" ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x9, #0xe0]\n" ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" + "ldr q7, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - "add x14, x14, #0x100\n" ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + "ldr q6, [x9, #0x0]\n" ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + "ldr q0, [x25, #0x0]\n" ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + "ldr q1, [x24, #0x0]\n" ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + "ldr q2, [x23, #0x0]\n" ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + "ldr q3, [x22, #0x0]\n" ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" - "bge 158b\n" - "159:" // Height 5: Multiply loop: Single iteration only - "sub x11, x11, #0x10\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q4, [x22, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "ldr q4, [x21, #0x0]\n" + "bge 153b\n" + "154:" // Height 5: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, 
v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" + "sub x26, x26, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "add x25, x25, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "add x28, x28, #0x10\n" + "add x24, x24, #0x10\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x23, x23, #0x10\n" ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "ldr q6, [x14, #0x20]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "add x22, x22, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "add x24, x24, #0x10\n" - "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x9, #0x20]\n" + "add x21, x21, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "add x22, x22, #0x10\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x21, #0x80]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x14, #0x40]\n" + "ldr q6, [x9, #0x40]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x9, #0x50]\n" ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, 
v0.4b[1]\n" ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x9, #0x60]\n" ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x9, #0x70]\n" ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x9, #0xa0]\n" ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x9, #0xb0]\n" ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, 
v1.4b[2]\n" ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x9, #0xc0]\n" ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x9, #0xd0]\n" ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x9, #0xe0]\n" ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" + "ldr q7, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - "add x14, x14, #0x100\n" ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" @@ -2279,34 +2226,34 @@ void a64_hybrid_s8s32_dot_6x16 ( ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" - "160:" // Height 5: Multiply loop: Main loop skip - "cbz x11, 165f\n" - "cmp x11, #0x4\n" - "blt 162f\n" - "161:" // Height 5: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x28], #0x4\n" - "ldr s2, [x26], #0x4\n" - "ldr s3, [x24], #0x4\n" - "ldr s4, [x22], #0x4\n" - "ldr q6, [x14, #0x0]\n" + 
"155:" // Height 5: Multiply loop: Main loop skip + "cbz x26, 160f\n" + "cmp x26, #0x4\n" + "blt 157f\n" + "156:" // Height 5: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x4\n" + "ldr s1, [x24], #0x4\n" + "cmp x26, #0x4\n" + "ldr s2, [x23], #0x4\n" + "ldr s3, [x22], #0x4\n" + "ldr s4, [x21], #0x4\n" + "ldr q6, [x9, #0x0]\n" ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "sub x11, x11, #0x4\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "cmp x11, #0x4\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x9, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" + "add x9, x9, #0x40\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" @@ -2316,45 +2263,45 @@ void a64_hybrid_s8s32_dot_6x16 ( ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" - "bge 161b\n" - "cbz x11, 165f\n" - "162:" // Height 5: Multiply loop: Skip odd blocks - "tbz x11, #1, 163f\n" - "ldr h0, [x10], #0x2\n" - "ldr h1, [x28], #0x2\n" - "ldr h2, [x26], #0x2\n" - "ldr h3, [x24], #0x2\n" - "ldr h4, [x22], #0x2\n" - "tbz x11, #0, 164f\n" - "ld1 { v0.b }[2], [x10]\n" - "ld1 { v1.b }[2], [x28]\n" - "ld1 { v2.b }[2], [x26]\n" - "ld1 { v3.b }[2], [x24]\n" - "ld1 { v4.b }[2], [x22]\n" - "b 
164f\n" - "163:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x10, #0x0]\n" - "ldr b1, [x28, #0x0]\n" - "ldr b2, [x26, #0x0]\n" - "ldr b3, [x24, #0x0]\n" - "ldr b4, [x22, #0x0]\n" - "164:" // Height 5: Multiply loop: Ragged operand read: Done - "ldr q6, [x14, #0x0]\n" + "bge 156b\n" + "cbz x26, 160f\n" + "157:" // Height 5: Multiply loop: Skip odd blocks + "tbz x26, #1, 158f\n" + "ldr h0, [x25], #0x2\n" + "ldr h1, [x24], #0x2\n" + "ldr h2, [x23], #0x2\n" + "ldr h3, [x22], #0x2\n" + "ldr h4, [x21], #0x2\n" + "tbz x26, #0, 159f\n" + "ld1 { v0.b }[2], [x25]\n" + "ld1 { v1.b }[2], [x24]\n" + "ld1 { v2.b }[2], [x23]\n" + "ld1 { v3.b }[2], [x22]\n" + "ld1 { v4.b }[2], [x21]\n" + "b 159f\n" + "158:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x25, #0x0]\n" + "ldr b1, [x24, #0x0]\n" + "ldr b2, [x23, #0x0]\n" + "ldr b3, [x22, #0x0]\n" + "ldr b4, [x21, #0x0]\n" + "159:" // Height 5: Multiply loop: Ragged operand read: Done + "ldr q6, [x9, #0x0]\n" ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x9, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" + "add x9, x9, #0x40\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" @@ -2364,356 +2311,338 @@ 
void a64_hybrid_s8s32_dot_6x16 ( ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" - "165:" // Height 5: Multiply loop: No odd multiplies + "160:" // Height 5: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 155b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "cmp x15, #0x10\n" - "prfm pstl1keep, [x27, #0x0]\n" - "prfm pstl1keep, [x25, #0x0]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 150b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "cmp x10, #0x10\n" + "add x23, x28, x19, LSL #2\n" "prfm pstl1keep, [x23, #0x0]\n" - "bge 174f\n" - "tbz x15, #3, 169f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v9.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v13.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "st1 { v17.4s }, [x27], #0x10\n" - "st1 { v20.4s }, [x25], #0x10\n" - "st1 { v21.4s }, [x25], #0x10\n" - "st1 { v24.4s }, [x23], #0x10\n" - "st1 { v25.4s }, [x23], #0x10\n" - "tbz x15, #2, 167f\n" - "st1 { v10.4s }, [x13], #0x10\n" - "st1 { v14.4s }, [x9], #0x10\n" - "st1 { v18.4s }, [x27], #0x10\n" - "st1 { v22.4s }, [x25], #0x10\n" - "st1 { v26.4s }, [x23], #0x10\n" - "tbz x15, #1, 166f\n" - "str d11, [x13], #0x8\n" - "str d15, [x9], #0x8\n" - "str d19, [x27], #0x8\n" - "str d23, [x25], #0x8\n" - "str d27, [x23], #0x8\n" - "tbz x15, #0, 173f\n" - "st1 { v11.s }[2], [x13]\n" - "st1 { v15.s }[2], [x9]\n" - "st1 { v19.s }[2], [x27]\n" - "st1 { v23.s }[2], [x25]\n" - "st1 { v27.s }[2], [x23]\n" - "b 173f\n" - "166:" // Height 5: Partial direct writeback: partial_1_12 - "tbz x15, #0, 173f\n" - "str s11, [x13, #0x0]\n" - "str s15, [x9, #0x0]\n" - "str s19, [x27, #0x0]\n" - "str s23, [x25, #0x0]\n" - "str s27, [x23, #0x0]\n" - "b 173f\n" - "167:" // Height 5: Partial 
direct writeback: partial_2_8 - "tbz x15, #1, 168f\n" - "str d10, [x13], #0x8\n" - "str d14, [x9], #0x8\n" - "str d18, [x27], #0x8\n" - "str d22, [x25], #0x8\n" - "str d26, [x23], #0x8\n" - "tbz x15, #0, 173f\n" - "st1 { v10.s }[2], [x13]\n" - "st1 { v14.s }[2], [x9]\n" - "st1 { v18.s }[2], [x27]\n" - "st1 { v22.s }[2], [x25]\n" - "st1 { v26.s }[2], [x23]\n" - "b 173f\n" - "168:" // Height 5: Partial direct writeback: partial_1_8 - "tbz x15, #0, 173f\n" - "str s10, [x13, #0x0]\n" - "str s14, [x9, #0x0]\n" - "str s18, [x27, #0x0]\n" - "str s22, [x25, #0x0]\n" - "str s26, [x23, #0x0]\n" - "b 173f\n" - "169:" // Height 5: Partial direct writeback: partial_4_0 - "tbz x15, #2, 171f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "st1 { v20.4s }, [x25], #0x10\n" - "st1 { v24.4s }, [x23], #0x10\n" - "tbz x15, #1, 170f\n" - "str d9, [x13], #0x8\n" - "str d13, [x9], #0x8\n" - "str d17, [x27], #0x8\n" - "str d21, [x25], #0x8\n" - "str d25, [x23], #0x8\n" - "tbz x15, #0, 173f\n" - "st1 { v9.s }[2], [x13]\n" - "st1 { v13.s }[2], [x9]\n" - "st1 { v17.s }[2], [x27]\n" - "st1 { v21.s }[2], [x25]\n" - "st1 { v25.s }[2], [x23]\n" - "b 173f\n" - "170:" // Height 5: Partial direct writeback: partial_1_4 - "tbz x15, #0, 173f\n" - "str s9, [x13, #0x0]\n" - "str s13, [x9, #0x0]\n" - "str s17, [x27, #0x0]\n" - "str s21, [x25, #0x0]\n" - "str s25, [x23, #0x0]\n" - "b 173f\n" - "171:" // Height 5: Partial direct writeback: partial_2_0 - "tbz x15, #1, 172f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "str d16, [x27], #0x8\n" - "str d20, [x25], #0x8\n" - "str d24, [x23], #0x8\n" - "tbz x15, #0, 173f\n" - "st1 { v8.s }[2], [x13]\n" - "st1 { v12.s }[2], [x9]\n" - "st1 { v16.s }[2], [x27]\n" - "st1 { v20.s }[2], [x25]\n" - "st1 { v24.s }[2], [x23]\n" - "b 173f\n" - "172:" // Height 5: Partial direct writeback: partial_1_0 - "str s8, [x13, #0x0]\n" - "str s12, [x9, #0x0]\n" - "str s16, [x27, #0x0]\n" - "str s20, [x25, 
#0x0]\n" - "str s24, [x23, #0x0]\n" - "173:" // Height 5: Partial direct writeback: Done - "b 175f\n" - "174:" // Height 5: Full writeback - "str q8, [x13, #0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "str q12, [x9, #0x0]\n" - "str q13, [x9, #0x10]\n" - "str q14, [x9, #0x20]\n" - "str q15, [x9, #0x30]\n" - "str q16, [x27, #0x0]\n" - "str q17, [x27, #0x10]\n" - "str q18, [x27, #0x20]\n" - "str q19, [x27, #0x30]\n" - "str q20, [x25, #0x0]\n" - "str q21, [x25, #0x10]\n" - "str q22, [x25, #0x20]\n" - "str q23, [x25, #0x30]\n" - "str q24, [x23, #0x0]\n" - "str q25, [x23, #0x10]\n" - "str q26, [x23, #0x20]\n" - "str q27, [x23, #0x30]\n" - "add x13, x13, #0x40\n" - "add x9, x9, #0x40\n" - "add x27, x27, #0x40\n" - "add x25, x25, #0x40\n" - "add x23, x23, #0x40\n" - "175:" // Height 5: Writeback done - "subs x15, x15, #0x10\n" - "bgt 143b\n" - "b 212f\n" - "176:" // Height 6 - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" + "prfm pstl1keep, [x21, #0x0]\n" + "add x20, x21, x19, LSL #2\n" + "prfm pstl1keep, [x20, #0x0]\n" + "bge 169f\n" + "tbz x10, #3, 164f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v9.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v13.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v17.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "st1 { v21.4s }, [x21], #0x10\n" + "st1 { v24.4s }, [x20], #0x10\n" + "st1 { v25.4s }, [x20], #0x10\n" + "tbz x10, #2, 162f\n" + "st1 { v10.4s }, [x28], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "st1 { v18.4s }, [x22], #0x10\n" + "st1 { v22.4s }, [x21], #0x10\n" + "st1 { v26.4s }, [x20], #0x10\n" + "tbz x10, #1, 161f\n" + "str d11, [x28], #0x8\n" + "str d15, [x23], #0x8\n" + "str d19, [x22], #0x8\n" + "str d23, [x21], #0x8\n" + "str d27, [x20], #0x8\n" + "tbz x10, #0, 168f\n" + "st1 { 
v11.s }[2], [x28]\n" + "st1 { v15.s }[2], [x23]\n" + "st1 { v19.s }[2], [x22]\n" + "st1 { v23.s }[2], [x21]\n" + "st1 { v27.s }[2], [x20]\n" + "b 168f\n" + "161:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x10, #0, 168f\n" + "str s11, [x28, #0x0]\n" + "str s15, [x23, #0x0]\n" + "str s19, [x22, #0x0]\n" + "str s23, [x21, #0x0]\n" + "str s27, [x20, #0x0]\n" + "b 168f\n" + "162:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x10, #1, 163f\n" + "str d10, [x28], #0x8\n" + "str d14, [x23], #0x8\n" + "str d18, [x22], #0x8\n" + "str d22, [x21], #0x8\n" + "str d26, [x20], #0x8\n" + "tbz x10, #0, 168f\n" + "st1 { v10.s }[2], [x28]\n" + "st1 { v14.s }[2], [x23]\n" + "st1 { v18.s }[2], [x22]\n" + "st1 { v22.s }[2], [x21]\n" + "st1 { v26.s }[2], [x20]\n" + "b 168f\n" + "163:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x10, #0, 168f\n" + "str s10, [x28, #0x0]\n" + "str s14, [x23, #0x0]\n" + "str s18, [x22, #0x0]\n" + "str s22, [x21, #0x0]\n" + "str s26, [x20, #0x0]\n" + "b 168f\n" + "164:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x10, #2, 166f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "st1 { v24.4s }, [x20], #0x10\n" + "tbz x10, #1, 165f\n" + "str d9, [x28], #0x8\n" + "str d13, [x23], #0x8\n" + "str d17, [x22], #0x8\n" + "str d21, [x21], #0x8\n" + "str d25, [x20], #0x8\n" + "tbz x10, #0, 168f\n" + "st1 { v9.s }[2], [x28]\n" + "st1 { v13.s }[2], [x23]\n" + "st1 { v17.s }[2], [x22]\n" + "st1 { v21.s }[2], [x21]\n" + "st1 { v25.s }[2], [x20]\n" + "b 168f\n" + "165:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x10, #0, 168f\n" + "str s9, [x28, #0x0]\n" + "str s13, [x23, #0x0]\n" + "str s17, [x22, #0x0]\n" + "str s21, [x21, #0x0]\n" + "str s25, [x20, #0x0]\n" + "b 168f\n" + "166:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x10, #1, 167f\n" + "str d8, [x28], #0x8\n" + "str d12, [x23], #0x8\n" + 
"str d16, [x22], #0x8\n" + "str d20, [x21], #0x8\n" + "str d24, [x20], #0x8\n" + "tbz x10, #0, 168f\n" + "st1 { v8.s }[2], [x28]\n" + "st1 { v12.s }[2], [x23]\n" + "st1 { v16.s }[2], [x22]\n" + "st1 { v20.s }[2], [x21]\n" + "st1 { v24.s }[2], [x20]\n" + "b 168f\n" + "167:" // Height 5: Partial direct writeback: partial_1_0 + "str s8, [x28, #0x0]\n" + "str s12, [x23, #0x0]\n" + "str s16, [x22, #0x0]\n" + "str s20, [x21, #0x0]\n" + "str s24, [x20, #0x0]\n" + "168:" // Height 5: Partial direct writeback: Done + "b 170f\n" + "169:" // Height 5: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q12, [x23, #0x0]\n" + "str q13, [x23, #0x10]\n" + "str q14, [x23, #0x20]\n" + "str q15, [x23, #0x30]\n" + "str q16, [x22, #0x0]\n" + "str q17, [x22, #0x10]\n" + "str q18, [x22, #0x20]\n" + "str q19, [x22, #0x30]\n" + "str q20, [x21, #0x0]\n" + "str q21, [x21, #0x10]\n" + "str q22, [x21, #0x20]\n" + "str q23, [x21, #0x30]\n" + "str q24, [x20, #0x0]\n" + "str q25, [x20, #0x10]\n" + "str q26, [x20, #0x20]\n" + "str q27, [x20, #0x30]\n" + "170:" // Height 5: Writeback done + "subs x10, x10, #0x10\n" + "bgt 138b\n" + "b 206f\n" + "171:" // Height 6 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x20, #0x18\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 177f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "ldr x23, [%x[output_ptr], #0x20]\n" - "add x27, x27, x19, LSL #2\n" - "ldr x21, [%x[output_ptr], #0x28]\n" - "add %x[output_ptr], %x[output_ptr], #0x30\n" - "add x25, x25, x19, LSL #2\n" - "add x23, x23, x19, LSL #2\n" - "add x21, x21, x19, LSL #2\n" - "b 178f\n" - "177:" // Height 6: setup direct 
output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "add x23, x25, x19, LSL #2\n" - "add x21, x23, x19, LSL #2\n" - "add %x[output_ptr], x21, x19, LSL #2\n" - "178:" // Height 6: Column loop - "tbz %x[flags], #0, 188f\n" - "cmp x15, #0x10\n" - "bge 187f\n" - "tbz x15, #3, 182f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "ld1 { v20.4s }, [x25], #0x10\n" - "ld1 { v24.4s }, [x23], #0x10\n" - "ld1 { v28.4s }, [x21], #0x10\n" - "ld1 { v9.4s }, [x13], #0x10\n" - "ld1 { v13.4s }, [x9], #0x10\n" - "ld1 { v17.4s }, [x27], #0x10\n" - "ld1 { v21.4s }, [x25], #0x10\n" - "ld1 { v25.4s }, [x23], #0x10\n" - "ld1 { v29.4s }, [x21], #0x10\n" - "tbz x15, #2, 180f\n" - "ld1 { v10.4s }, [x13], #0x10\n" - "ld1 { v14.4s }, [x9], #0x10\n" - "ld1 { v18.4s }, [x27], #0x10\n" - "ld1 { v22.4s }, [x25], #0x10\n" - "ld1 { v26.4s }, [x23], #0x10\n" - "ld1 { v30.4s }, [x21], #0x10\n" - "tbz x15, #1, 179f\n" - "mov x19, #0x38\n" - "ldr d11, [x13], #0x8\n" - "ldr d15, [x9], #0x8\n" - "ldr d19, [x27], #0x8\n" - "ldr d23, [x25], #0x8\n" - "ldr d27, [x23], #0x8\n" - "ldr d31, [x21], #0x8\n" - "tbz x15, #0, 186f\n" - "ld1 { v11.s }[2], [x13]\n" - "ld1 { v15.s }[2], [x9]\n" - "ld1 { v19.s }[2], [x27]\n" - "ld1 { v23.s }[2], [x25]\n" - "ld1 { v27.s }[2], [x23]\n" - "ld1 { v31.s }[2], [x21]\n" - "b 186f\n" - "179:" // Height 6: Partial accumulate: partial_1_12 - "mov x19, #0x30\n" - "tbz x15, #0, 186f\n" - "ldr s11, [x13, #0x0]\n" - "ldr s15, [x9, #0x0]\n" - "ldr s19, [x27, #0x0]\n" - "ldr s23, [x25, #0x0]\n" - "ldr s27, [x23, #0x0]\n" - "ldr s31, [x21, #0x0]\n" - "b 186f\n" - "180:" // Height 6: Partial accumulate: partial_2_8 - "tbz x15, #1, 181f\n" - "ldr d10, [x13], #0x8\n" - "ldr d14, [x9], #0x8\n" - "ldr d18, [x27], #0x8\n" - "ldr d22, [x25], #0x8\n" - "ldr d26, [x23], #0x8\n" - "ldr d30, [x21], #0x8\n" - "mov x19, #0x28\n" - "tbz x15, #0, 186f\n" - "ld1 { 
v10.s }[2], [x13]\n" - "ld1 { v14.s }[2], [x9]\n" - "ld1 { v18.s }[2], [x27]\n" - "ld1 { v22.s }[2], [x25]\n" - "ld1 { v26.s }[2], [x23]\n" - "ld1 { v30.s }[2], [x21]\n" - "b 186f\n" - "181:" // Height 6: Partial accumulate: partial_1_8 - "mov x19, #0x20\n" - "tbz x15, #0, 186f\n" - "ldr s10, [x13, #0x0]\n" - "ldr s14, [x9, #0x0]\n" - "ldr s18, [x27, #0x0]\n" - "ldr s22, [x25, #0x0]\n" - "ldr s26, [x23, #0x0]\n" - "ldr s30, [x21, #0x0]\n" - "b 186f\n" - "182:" // Height 6: Partial accumulate: partial_4_0 - "tbz x15, #2, 184f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "ld1 { v20.4s }, [x25], #0x10\n" - "ld1 { v24.4s }, [x23], #0x10\n" - "ld1 { v28.4s }, [x21], #0x10\n" - "tbz x15, #1, 183f\n" - "mov x19, #0x18\n" - "ldr d9, [x13], #0x8\n" - "ldr d13, [x9], #0x8\n" - "ldr d17, [x27], #0x8\n" - "ldr d21, [x25], #0x8\n" - "ldr d25, [x23], #0x8\n" - "ldr d29, [x21], #0x8\n" - "tbz x15, #0, 186f\n" - "ld1 { v9.s }[2], [x13]\n" - "ld1 { v13.s }[2], [x9]\n" - "ld1 { v17.s }[2], [x27]\n" - "ld1 { v21.s }[2], [x25]\n" - "ld1 { v25.s }[2], [x23]\n" - "ld1 { v29.s }[2], [x21]\n" - "b 186f\n" - "183:" // Height 6: Partial accumulate: partial_1_4 - "mov x19, #0x10\n" - "tbz x15, #0, 186f\n" - "ldr s9, [x13, #0x0]\n" - "ldr s13, [x9, #0x0]\n" - "ldr s17, [x27, #0x0]\n" - "ldr s21, [x25, #0x0]\n" - "ldr s25, [x23, #0x0]\n" - "ldr s29, [x21, #0x0]\n" - "b 186f\n" - "184:" // Height 6: Partial accumulate: partial_2_0 - "tbz x15, #1, 185f\n" - "ldr d8, [x13], #0x8\n" - "ldr d12, [x9], #0x8\n" - "ldr d16, [x27], #0x8\n" - "ldr d20, [x25], #0x8\n" - "ldr d24, [x23], #0x8\n" - "ldr d28, [x21], #0x8\n" - "mov x19, #0x8\n" - "tbz x15, #0, 186f\n" - "ld1 { v8.s }[2], [x13]\n" - "ld1 { v12.s }[2], [x9]\n" - "ld1 { v16.s }[2], [x27]\n" - "ld1 { v20.s }[2], [x25]\n" - "ld1 { v24.s }[2], [x23]\n" - "ld1 { v28.s }[2], [x21]\n" - "b 186f\n" - "185:" // Height 6: Partial accumulate: partial_1_0 - "mov x19, #0x0\n" - "ldr s8, [x13, 
#0x0]\n" - "ldr s12, [x9, #0x0]\n" - "ldr s16, [x27, #0x0]\n" - "ldr s20, [x25, #0x0]\n" - "ldr s24, [x23, #0x0]\n" - "ldr s28, [x21, #0x0]\n" - "186:" // Height 6: Partial accumulate: Done - "sub x13, x13, x19\n" - "sub x9, x9, x19\n" - "sub x27, x27, x19\n" - "sub x25, x25, x19\n" - "sub x23, x23, x19\n" - "sub x21, x21, x19\n" - "b 189f\n" - "187:" // Height 6: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "ldr q12, [x9, #0x0]\n" - "ldr q13, [x9, #0x10]\n" - "ldr q14, [x9, #0x20]\n" - "ldr q15, [x9, #0x30]\n" - "ldr q16, [x27, #0x0]\n" - "ldr q17, [x27, #0x10]\n" - "ldr q18, [x27, #0x20]\n" - "ldr q19, [x27, #0x30]\n" - "ldr q20, [x25, #0x0]\n" - "ldr q21, [x25, #0x10]\n" - "ldr q22, [x25, #0x20]\n" - "ldr q23, [x25, #0x30]\n" - "ldr q24, [x23, #0x0]\n" - "ldr q25, [x23, #0x10]\n" - "ldr q26, [x23, #0x20]\n" - "ldr q27, [x23, #0x30]\n" - "ldr q28, [x21, #0x0]\n" - "ldr q29, [x21, #0x10]\n" - "ldr q30, [x21, #0x20]\n" - "ldr q31, [x21, #0x30]\n" - "b 189f\n" - "188:" // Height 6: no accumulate + "madd %x[output_ptr], x19, x20, %x[output_ptr]\n" + "172:" // Height 6: Column loop + "tbz %x[flags], #0, 182f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x10, #0x10\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "add x19, x20, x19, LSL #2\n" + "bge 181f\n" + "tbz x10, #3, 176f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v24.4s }, [x20], #0x10\n" + "ld1 { v28.4s }, [x19], #0x10\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { v21.4s }, [x21], #0x10\n" + "ld1 { v25.4s }, [x20], #0x10\n" + "ld1 { v29.4s }, [x19], #0x10\n" + "tbz x10, #2, 174f\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + 
"ld1 { v18.4s }, [x22], #0x10\n" + "ld1 { v22.4s }, [x21], #0x10\n" + "ld1 { v26.4s }, [x20], #0x10\n" + "ld1 { v30.4s }, [x19], #0x10\n" + "tbz x10, #1, 173f\n" + "mov x24, #0x38\n" + "ldr d11, [x28], #0x8\n" + "ldr d15, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" + "ldr d27, [x20], #0x8\n" + "ldr d31, [x19], #0x8\n" + "tbz x10, #0, 180f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x23]\n" + "ld1 { v19.s }[2], [x22]\n" + "ld1 { v23.s }[2], [x21]\n" + "ld1 { v27.s }[2], [x20]\n" + "ld1 { v31.s }[2], [x19]\n" + "b 180f\n" + "173:" // Height 6: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x10, #0, 180f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "ldr s23, [x21, #0x0]\n" + "ldr s27, [x20, #0x0]\n" + "ldr s31, [x19, #0x0]\n" + "b 180f\n" + "174:" // Height 6: Partial accumulate: partial_2_8 + "tbz x10, #1, 175f\n" + "ldr d10, [x28], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "ldr d18, [x22], #0x8\n" + "ldr d22, [x21], #0x8\n" + "ldr d26, [x20], #0x8\n" + "ldr d30, [x19], #0x8\n" + "tbz x10, #0, 180f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x23]\n" + "ld1 { v18.s }[2], [x22]\n" + "ld1 { v22.s }[2], [x21]\n" + "ld1 { v26.s }[2], [x20]\n" + "ld1 { v30.s }[2], [x19]\n" + "b 180f\n" + "175:" // Height 6: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x10, #0, 180f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "ldr s22, [x21, #0x0]\n" + "ldr s26, [x20, #0x0]\n" + "ldr s30, [x19, #0x0]\n" + "b 180f\n" + "176:" // Height 6: Partial accumulate: partial_4_0 + "tbz x10, #2, 178f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v24.4s }, [x20], #0x10\n" + "ld1 { v28.4s }, [x19], #0x10\n" + "tbz x10, #1, 177f\n" + "mov x24, #0x18\n" + "ldr d9, [x28], #0x8\n" + "ldr d13, [x23], #0x8\n" + "ldr d17, [x22], 
#0x8\n" + "ldr d21, [x21], #0x8\n" + "ldr d25, [x20], #0x8\n" + "ldr d29, [x19], #0x8\n" + "tbz x10, #0, 180f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x23]\n" + "ld1 { v17.s }[2], [x22]\n" + "ld1 { v21.s }[2], [x21]\n" + "ld1 { v25.s }[2], [x20]\n" + "ld1 { v29.s }[2], [x19]\n" + "b 180f\n" + "177:" // Height 6: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x10, #0, 180f\n" + "ldr s9, [x28, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "ldr s21, [x21, #0x0]\n" + "ldr s25, [x20, #0x0]\n" + "ldr s29, [x19, #0x0]\n" + "b 180f\n" + "178:" // Height 6: Partial accumulate: partial_2_0 + "tbz x10, #1, 179f\n" + "ldr d8, [x28], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "ldr d16, [x22], #0x8\n" + "ldr d20, [x21], #0x8\n" + "ldr d24, [x20], #0x8\n" + "ldr d28, [x19], #0x8\n" + "tbz x10, #0, 180f\n" + "ld1 { v8.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x23]\n" + "ld1 { v16.s }[2], [x22]\n" + "ld1 { v20.s }[2], [x21]\n" + "ld1 { v24.s }[2], [x20]\n" + "ld1 { v28.s }[2], [x19]\n" + "b 180f\n" + "179:" // Height 6: Partial accumulate: partial_1_0 + "ldr s8, [x28, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "ldr s16, [x22, #0x0]\n" + "ldr s20, [x21, #0x0]\n" + "ldr s24, [x20, #0x0]\n" + "ldr s28, [x19, #0x0]\n" + "180:" // Height 6: Partial accumulate: Done + "sub x28, x28, x24\n" + "b 183f\n" + "181:" // Height 6: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + "ldr q16, [x22, #0x0]\n" + "ldr q17, [x22, #0x10]\n" + "ldr q18, [x22, #0x20]\n" + "ldr q19, [x22, #0x30]\n" + "ldr q20, [x21, #0x0]\n" + "ldr q21, [x21, #0x10]\n" + "ldr q22, [x21, #0x20]\n" + "ldr q23, [x21, #0x30]\n" + "ldr q24, [x20, #0x0]\n" + "ldr q25, [x20, #0x10]\n" + "ldr q26, [x20, #0x20]\n" + "ldr q27, [x20, #0x30]\n" + "ldr q28, [x19, #0x0]\n" + "ldr 
q29, [x19, #0x10]\n" + "ldr q30, [x19, #0x20]\n" + "ldr q31, [x19, #0x30]\n" + "b 183f\n" + "182:" // Height 6: no accumulate "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -2738,299 +2667,299 @@ void a64_hybrid_s8s32_dot_6x16 ( "movi v29.4s, #0x0\n" "movi v30.4s, #0x0\n" "movi v31.4s, #0x0\n" - "189:" // Height 6: setup done - "mov x12, #0x0\n" - "190:" // Height 6: String loop + "183:" // Height 6: setup done + "mov x27, #0x0\n" + "184:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 191f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 185f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "ldr x22, [x20, #0x20]\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" "ldr x20, [x20, #0x28]\n" - "cbnz x12, 192f\n" + "cbnz x27, 186f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "add x26, x26, x19\n" + "add x25, x25, x19\n" "add x24, x24, x19\n" + "add x23, x23, x19\n" "add x22, x22, x19\n" + "add x21, x21, x19\n" "add x20, x20, x19\n" - "b 192f\n" - "191:" // Height 6: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "add x26, x28, x19\n" - "add x24, x26, x19\n" - "add x22, x24, x19\n" - "add x20, x22, x19\n" - "192:" // Height 6: input setup done - "cmp x11, #0x10\n" - "blt 195f\n" - "cmp x11, #0x20\n" - "blt 194f\n" - "193:" // Height 6: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q4, [x22, #0x0]\n" + "b 186f\n" + "185:" // Height 6: 
setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "add x20, x21, x19\n" + "186:" // Height 6: input setup done + "cmp x26, #0x10\n" + "blt 189f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x20\n" + "ldr q2, [x23, #0x0]\n" + "ldr q3, [x22, #0x0]\n" + "ldr q4, [x21, #0x0]\n" "ldr q5, [x20, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "ldr q6, [x9, #0x0]\n" + "blt 188f\n" + "187:" // Height 6: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" + "add x25, x25, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x23, x23, #0x10\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x23, #0x80]\n" + "add x22, x22, #0x10\n" ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "add x24, x24, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "add x21, x21, #0x10\n" ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x9, #0x20]\n" + "add x20, x20, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "add x22, x22, #0x10\n" - "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "sub x26, x26, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "add x20, x20, #0x10\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" "prfm pldl1keep, [x20, #0x80]\n" - "sub x11, x11, #0x10\n" + "cmp x26, #0x20\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, 
v3.4b[0]\n" - "cmp x11, #0x20\n" ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" - "ldr q6, [x14, #0x40]\n" + "ldr q6, [x9, #0x40]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x9, #0x50]\n" ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x9, #0x60]\n" ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x9, #0x70]\n" ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" ".inst 0x4fa5e0de // sdot 
v30.4s, v6.16b, v5.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x9, #0xa0]\n" ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x9, #0xb0]\n" ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x9, #0xc0]\n" ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x9, #0xd0]\n" ".inst 0x4fa0e8c8 // sdot v8.4s, 
v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x9, #0xe0]\n" ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" + "ldr q7, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - "add x14, x14, #0x100\n" ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" ".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n" + "ldr q6, [x9, #0x0]\n" ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + "ldr q0, [x25, #0x0]\n" ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + "ldr q1, [x24, #0x0]\n" ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + "ldr q2, [x23, #0x0]\n" ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + "ldr q3, [x22, #0x0]\n" ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + "ldr q4, [x21, #0x0]\n" ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n" - "bge 193b\n" - "194:" // Height 6: Multiply loop: Single iteration only - "sub x11, x11, #0x10\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q4, [x22, #0x0]\n" "ldr q5, [x20, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "bge 187b\n" + "188:" // Height 6: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, 
#0x10]\n" + "ldr q7, [x9, #0x10]\n" + "sub x26, x26, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "add x25, x25, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "add x28, x28, #0x10\n" - ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" - ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" "add x24, x24, #0x10\n" - ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" "prfm pldl1keep, [x24, #0x80]\n" - "ldr q6, [x14, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x23, x23, #0x10\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x23, #0x80]\n" "add x22, x22, #0x10\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" "prfm pldl1keep, [x22, #0x80]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x21, x21, #0x10\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q6, [x9, #0x20]\n" "add x20, x20, #0x10\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "prfm pldl1keep, [x21, #0x80]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" "prfm pldl1keep, [x20, #0x80]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" - "ldr q6, [x14, #0x40]\n" + "ldr q6, [x9, #0x40]\n" ".inst 0x4f80e0eb // sdot v11.4s, 
v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x9, #0x50]\n" ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x9, #0x60]\n" ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x9, #0x70]\n" ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, 
v2.4b[2]\n" ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x9, #0xa0]\n" ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x9, #0xb0]\n" ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x9, #0xc0]\n" ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x9, #0xd0]\n" ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x9, #0xe0]\n" ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, 
v4.4b[3]\n" ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" + "ldr q7, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - "add x14, x14, #0x100\n" ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" @@ -3042,37 +2971,37 @@ void a64_hybrid_s8s32_dot_6x16 ( ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n" - "195:" // Height 6: Multiply loop: Main loop skip - "cbz x11, 200f\n" - "cmp x11, #0x4\n" - "blt 197f\n" - "196:" // Height 6: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x28], #0x4\n" - "ldr s2, [x26], #0x4\n" - "ldr s3, [x24], #0x4\n" - "ldr s4, [x22], #0x4\n" + "189:" // Height 6: Multiply loop: Main loop skip + "cbz x26, 194f\n" + "cmp x26, #0x4\n" + "blt 191f\n" + "190:" // Height 6: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x4\n" + "ldr s1, [x24], #0x4\n" + "cmp x26, #0x4\n" + "ldr s2, [x23], #0x4\n" + "ldr s3, [x22], #0x4\n" + "ldr s4, [x21], #0x4\n" "ldr s5, [x20], #0x4\n" - "ldr q6, [x14, #0x0]\n" + "ldr q6, [x9, #0x0]\n" ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "sub x11, x11, #0x4\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "cmp x11, #0x4\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x9, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" ".inst 
0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" + "add x9, x9, #0x40\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" @@ -3084,50 +3013,50 @@ void a64_hybrid_s8s32_dot_6x16 ( ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" - "bge 196b\n" - "cbz x11, 200f\n" - "197:" // Height 6: Multiply loop: Skip odd blocks - "tbz x11, #1, 198f\n" - "ldr h0, [x10], #0x2\n" - "ldr h1, [x28], #0x2\n" - "ldr h2, [x26], #0x2\n" - "ldr h3, [x24], #0x2\n" - "ldr h4, [x22], #0x2\n" + "bge 190b\n" + "cbz x26, 194f\n" + "191:" // Height 6: Multiply loop: Skip odd blocks + "tbz x26, #1, 192f\n" + "ldr h0, [x25], #0x2\n" + "ldr h1, [x24], #0x2\n" + "ldr h2, [x23], #0x2\n" + "ldr h3, [x22], #0x2\n" + "ldr h4, [x21], #0x2\n" "ldr h5, [x20], #0x2\n" - "tbz x11, #0, 199f\n" - "ld1 { v0.b }[2], [x10]\n" - "ld1 { v1.b }[2], [x28]\n" - "ld1 { v2.b }[2], [x26]\n" - "ld1 { v3.b }[2], [x24]\n" - "ld1 { v4.b }[2], [x22]\n" + "tbz x26, #0, 193f\n" + "ld1 { v0.b }[2], [x25]\n" + "ld1 { v1.b }[2], [x24]\n" + "ld1 { v2.b }[2], [x23]\n" + "ld1 { v3.b }[2], [x22]\n" + "ld1 { v4.b }[2], [x21]\n" "ld1 { v5.b }[2], [x20]\n" - "b 199f\n" - "198:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x10, #0x0]\n" - "ldr b1, [x28, #0x0]\n" - "ldr b2, [x26, #0x0]\n" - "ldr b3, [x24, #0x0]\n" - "ldr b4, [x22, #0x0]\n" + "b 193f\n" + "192:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x25, #0x0]\n" + "ldr b1, [x24, #0x0]\n" + "ldr b2, [x23, #0x0]\n" + "ldr b3, [x22, #0x0]\n" + "ldr b4, [x21, #0x0]\n" "ldr b5, [x20, #0x0]\n" - "199:" // Height 6: Multiply loop: 
Ragged operand read: Done - "ldr q6, [x14, #0x0]\n" + "193:" // Height 6: Multiply loop: Ragged operand read: Done + "ldr q6, [x9, #0x0]\n" ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x9, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" + "add x9, x9, #0x40\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" @@ -3139,195 +3068,196 @@ void a64_hybrid_s8s32_dot_6x16 ( ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" - "200:" // Height 6: Multiply loop: No odd multiplies + "194:" // Height 6: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 190b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "cmp x15, #0x10\n" - "prfm pstl1keep, [x27, #0x0]\n" - "prfm pstl1keep, [x25, #0x0]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 184b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "cmp x10, #0x10\n" + "add x23, x28, 
x19, LSL #2\n" "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" "prfm pstl1keep, [x21, #0x0]\n" - "bge 209f\n" - "tbz x15, #3, 204f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v9.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v13.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "st1 { v17.4s }, [x27], #0x10\n" - "st1 { v20.4s }, [x25], #0x10\n" - "st1 { v21.4s }, [x25], #0x10\n" - "st1 { v24.4s }, [x23], #0x10\n" - "st1 { v25.4s }, [x23], #0x10\n" - "st1 { v28.4s }, [x21], #0x10\n" - "st1 { v29.4s }, [x21], #0x10\n" - "tbz x15, #2, 202f\n" - "st1 { v10.4s }, [x13], #0x10\n" - "st1 { v14.4s }, [x9], #0x10\n" - "st1 { v18.4s }, [x27], #0x10\n" - "st1 { v22.4s }, [x25], #0x10\n" - "st1 { v26.4s }, [x23], #0x10\n" - "st1 { v30.4s }, [x21], #0x10\n" - "tbz x15, #1, 201f\n" - "str d11, [x13], #0x8\n" - "str d15, [x9], #0x8\n" - "str d19, [x27], #0x8\n" - "str d23, [x25], #0x8\n" - "str d27, [x23], #0x8\n" - "str d31, [x21], #0x8\n" - "tbz x15, #0, 208f\n" - "st1 { v11.s }[2], [x13]\n" - "st1 { v15.s }[2], [x9]\n" - "st1 { v19.s }[2], [x27]\n" - "st1 { v23.s }[2], [x25]\n" - "st1 { v27.s }[2], [x23]\n" - "st1 { v31.s }[2], [x21]\n" - "b 208f\n" - "201:" // Height 6: Partial direct writeback: partial_1_12 - "tbz x15, #0, 208f\n" - "str s11, [x13, #0x0]\n" - "str s15, [x9, #0x0]\n" - "str s19, [x27, #0x0]\n" - "str s23, [x25, #0x0]\n" - "str s27, [x23, #0x0]\n" - "str s31, [x21, #0x0]\n" - "b 208f\n" - "202:" // Height 6: Partial direct writeback: partial_2_8 - "tbz x15, #1, 203f\n" - "str d10, [x13], #0x8\n" - "str d14, [x9], #0x8\n" - "str d18, [x27], #0x8\n" - "str d22, [x25], #0x8\n" - "str d26, [x23], #0x8\n" - "str d30, [x21], #0x8\n" - "tbz x15, #0, 208f\n" - "st1 { v10.s }[2], [x13]\n" - "st1 { v14.s }[2], [x9]\n" - "st1 { v18.s }[2], [x27]\n" - "st1 { v22.s }[2], [x25]\n" - "st1 { v26.s }[2], [x23]\n" - "st1 { v30.s }[2], [x21]\n" - "b 208f\n" - "203:" // 
Height 6: Partial direct writeback: partial_1_8 - "tbz x15, #0, 208f\n" - "str s10, [x13, #0x0]\n" - "str s14, [x9, #0x0]\n" - "str s18, [x27, #0x0]\n" - "str s22, [x25, #0x0]\n" - "str s26, [x23, #0x0]\n" - "str s30, [x21, #0x0]\n" - "b 208f\n" - "204:" // Height 6: Partial direct writeback: partial_4_0 - "tbz x15, #2, 206f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "st1 { v20.4s }, [x25], #0x10\n" - "st1 { v24.4s }, [x23], #0x10\n" - "st1 { v28.4s }, [x21], #0x10\n" - "tbz x15, #1, 205f\n" - "str d9, [x13], #0x8\n" - "str d13, [x9], #0x8\n" - "str d17, [x27], #0x8\n" - "str d21, [x25], #0x8\n" - "str d25, [x23], #0x8\n" - "str d29, [x21], #0x8\n" - "tbz x15, #0, 208f\n" - "st1 { v9.s }[2], [x13]\n" - "st1 { v13.s }[2], [x9]\n" - "st1 { v17.s }[2], [x27]\n" - "st1 { v21.s }[2], [x25]\n" - "st1 { v25.s }[2], [x23]\n" - "st1 { v29.s }[2], [x21]\n" - "b 208f\n" - "205:" // Height 6: Partial direct writeback: partial_1_4 - "tbz x15, #0, 208f\n" - "str s9, [x13, #0x0]\n" - "str s13, [x9, #0x0]\n" - "str s17, [x27, #0x0]\n" - "str s21, [x25, #0x0]\n" - "str s25, [x23, #0x0]\n" - "str s29, [x21, #0x0]\n" - "b 208f\n" - "206:" // Height 6: Partial direct writeback: partial_2_0 - "tbz x15, #1, 207f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "str d16, [x27], #0x8\n" - "str d20, [x25], #0x8\n" - "str d24, [x23], #0x8\n" - "str d28, [x21], #0x8\n" - "tbz x15, #0, 208f\n" - "st1 { v8.s }[2], [x13]\n" - "st1 { v12.s }[2], [x9]\n" - "st1 { v16.s }[2], [x27]\n" - "st1 { v20.s }[2], [x25]\n" - "st1 { v24.s }[2], [x23]\n" - "st1 { v28.s }[2], [x21]\n" - "b 208f\n" - "207:" // Height 6: Partial direct writeback: partial_1_0 - "str s8, [x13, #0x0]\n" - "str s12, [x9, #0x0]\n" - "str s16, [x27, #0x0]\n" - "str s20, [x25, #0x0]\n" - "str s24, [x23, #0x0]\n" - "str s28, [x21, #0x0]\n" - "208:" // Height 6: Partial direct writeback: Done - "b 210f\n" - "209:" // Height 6: Full writeback - "str q8, [x13, 
#0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "str q12, [x9, #0x0]\n" - "str q13, [x9, #0x10]\n" - "str q14, [x9, #0x20]\n" - "str q15, [x9, #0x30]\n" - "str q16, [x27, #0x0]\n" - "str q17, [x27, #0x10]\n" - "str q18, [x27, #0x20]\n" - "str q19, [x27, #0x30]\n" - "str q20, [x25, #0x0]\n" - "str q21, [x25, #0x10]\n" - "str q22, [x25, #0x20]\n" - "str q23, [x25, #0x30]\n" - "str q24, [x23, #0x0]\n" - "str q25, [x23, #0x10]\n" - "str q26, [x23, #0x20]\n" - "str q27, [x23, #0x30]\n" - "str q28, [x21, #0x0]\n" - "str q29, [x21, #0x10]\n" - "str q30, [x21, #0x20]\n" - "str q31, [x21, #0x30]\n" - "add x13, x13, #0x40\n" - "add x9, x9, #0x40\n" - "add x27, x27, #0x40\n" - "add x25, x25, #0x40\n" - "add x23, x23, #0x40\n" - "add x21, x21, #0x40\n" - "210:" // Height 6: Writeback done - "subs x15, x15, #0x10\n" - "bgt 178b\n" + "add x20, x21, x19, LSL #2\n" + "prfm pstl1keep, [x20, #0x0]\n" + "add x19, x20, x19, LSL #2\n" + "prfm pstl1keep, [x19, #0x0]\n" + "bge 203f\n" + "tbz x10, #3, 198f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v9.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v13.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v17.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "st1 { v21.4s }, [x21], #0x10\n" + "st1 { v24.4s }, [x20], #0x10\n" + "st1 { v25.4s }, [x20], #0x10\n" + "st1 { v28.4s }, [x19], #0x10\n" + "st1 { v29.4s }, [x19], #0x10\n" + "tbz x10, #2, 196f\n" + "st1 { v10.4s }, [x28], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "st1 { v18.4s }, [x22], #0x10\n" + "st1 { v22.4s }, [x21], #0x10\n" + "st1 { v26.4s }, [x20], #0x10\n" + "st1 { v30.4s }, [x19], #0x10\n" + "tbz x10, #1, 195f\n" + "str d11, [x28], #0x8\n" + "str d15, [x23], #0x8\n" + "str d19, [x22], #0x8\n" + "str d23, [x21], #0x8\n" + "str d27, [x20], #0x8\n" + "str d31, [x19], #0x8\n" + "tbz x10, #0, 202f\n" + "st1 { v11.s }[2], [x28]\n" + "st1 { v15.s }[2], [x23]\n" + "st1 { v19.s }[2], [x22]\n" 
+ "st1 { v23.s }[2], [x21]\n" + "st1 { v27.s }[2], [x20]\n" + "st1 { v31.s }[2], [x19]\n" + "b 202f\n" + "195:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x10, #0, 202f\n" + "str s11, [x28, #0x0]\n" + "str s15, [x23, #0x0]\n" + "str s19, [x22, #0x0]\n" + "str s23, [x21, #0x0]\n" + "str s27, [x20, #0x0]\n" + "str s31, [x19, #0x0]\n" + "b 202f\n" + "196:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x10, #1, 197f\n" + "str d10, [x28], #0x8\n" + "str d14, [x23], #0x8\n" + "str d18, [x22], #0x8\n" + "str d22, [x21], #0x8\n" + "str d26, [x20], #0x8\n" + "str d30, [x19], #0x8\n" + "tbz x10, #0, 202f\n" + "st1 { v10.s }[2], [x28]\n" + "st1 { v14.s }[2], [x23]\n" + "st1 { v18.s }[2], [x22]\n" + "st1 { v22.s }[2], [x21]\n" + "st1 { v26.s }[2], [x20]\n" + "st1 { v30.s }[2], [x19]\n" + "b 202f\n" + "197:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x10, #0, 202f\n" + "str s10, [x28, #0x0]\n" + "str s14, [x23, #0x0]\n" + "str s18, [x22, #0x0]\n" + "str s22, [x21, #0x0]\n" + "str s26, [x20, #0x0]\n" + "str s30, [x19, #0x0]\n" + "b 202f\n" + "198:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x10, #2, 200f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "st1 { v24.4s }, [x20], #0x10\n" + "st1 { v28.4s }, [x19], #0x10\n" + "tbz x10, #1, 199f\n" + "str d9, [x28], #0x8\n" + "str d13, [x23], #0x8\n" + "str d17, [x22], #0x8\n" + "str d21, [x21], #0x8\n" + "str d25, [x20], #0x8\n" + "str d29, [x19], #0x8\n" + "tbz x10, #0, 202f\n" + "st1 { v9.s }[2], [x28]\n" + "st1 { v13.s }[2], [x23]\n" + "st1 { v17.s }[2], [x22]\n" + "st1 { v21.s }[2], [x21]\n" + "st1 { v25.s }[2], [x20]\n" + "st1 { v29.s }[2], [x19]\n" + "b 202f\n" + "199:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x10, #0, 202f\n" + "str s9, [x28, #0x0]\n" + "str s13, [x23, #0x0]\n" + "str s17, [x22, #0x0]\n" + "str s21, [x21, #0x0]\n" + "str s25, [x20, #0x0]\n" + 
"str s29, [x19, #0x0]\n" + "b 202f\n" + "200:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x10, #1, 201f\n" + "str d8, [x28], #0x8\n" + "str d12, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "str d20, [x21], #0x8\n" + "str d24, [x20], #0x8\n" + "str d28, [x19], #0x8\n" + "tbz x10, #0, 202f\n" + "st1 { v8.s }[2], [x28]\n" + "st1 { v12.s }[2], [x23]\n" + "st1 { v16.s }[2], [x22]\n" + "st1 { v20.s }[2], [x21]\n" + "st1 { v24.s }[2], [x20]\n" + "st1 { v28.s }[2], [x19]\n" + "b 202f\n" + "201:" // Height 6: Partial direct writeback: partial_1_0 + "str s8, [x28, #0x0]\n" + "str s12, [x23, #0x0]\n" + "str s16, [x22, #0x0]\n" + "str s20, [x21, #0x0]\n" + "str s24, [x20, #0x0]\n" + "str s28, [x19, #0x0]\n" + "202:" // Height 6: Partial direct writeback: Done + "b 204f\n" + "203:" // Height 6: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q12, [x23, #0x0]\n" + "str q13, [x23, #0x10]\n" + "str q14, [x23, #0x20]\n" + "str q15, [x23, #0x30]\n" + "str q16, [x22, #0x0]\n" + "str q17, [x22, #0x10]\n" + "str q18, [x22, #0x20]\n" + "str q19, [x22, #0x30]\n" + "str q20, [x21, #0x0]\n" + "str q21, [x21, #0x10]\n" + "str q22, [x21, #0x20]\n" + "str q23, [x21, #0x30]\n" + "str q24, [x20, #0x0]\n" + "str q25, [x20, #0x10]\n" + "str q26, [x20, #0x20]\n" + "str q27, [x20, #0x30]\n" + "str q28, [x19, #0x0]\n" + "str q29, [x19, #0x10]\n" + "str q30, [x19, #0x20]\n" + "str q31, [x19, #0x30]\n" + "204:" // Height 6: Writeback done + "subs x10, x10, #0x10\n" + "bgt 172b\n" "subs %x[M], %x[M], #0x6\n" - "beq 212f\n" + "beq 206f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 211f\n" + "tbz %x[flags], #3, 205f\n" "add x20, x20, #0x6\n" "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "211:" // Update direct input + "205:" // Update direct input "mov x19, #0x6\n" "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" "b 1b\n" - "212:" 
// Exit + "206:" // Exit : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp index acf46205a3..5d9d84815a 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp @@ -37,9 +37,9 @@ namespace arm_gemm { - // Actual kernel implementations void a64_hybrid_u8qa_dot_4x16( ARGLIST ); +void a64_hybrid_u8qa_dot_4x16_a55( ARGLIST ); class cls_a64_hybrid_u8qa_dot_4x16 { @@ -72,7 +72,8 @@ public: StdTransformsFixed transforms = {}; - static PerformanceParameters get_performance_parameters(const CPUInfo *ci) { + static PerformanceParameters 
get_performance_parameters(const CPUInfo *ci) + { switch (ci->get_cpu_model()) { case CPUModel::A55r1: return { 7.5301 }; @@ -83,9 +84,15 @@ public: // Default to the generic kernel kern_type kernel=a64_hybrid_u8qa_dot_4x16; - - cls_a64_hybrid_u8qa_dot_4x16(const CPUInfo *) + cls_a64_hybrid_u8qa_dot_4x16(const CPUInfo *ci) { + switch(ci->get_cpu_model()) { + default: + break; + case CPUModel::A55r1: + kernel=a64_hybrid_u8qa_dot_4x16_a55; + break; + } } }; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp new file mode 100644 index 0000000000..690085cb3b --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp @@ -0,0 +1,2148 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include <cassert> +#include <cstdint> + +namespace arm_gemm { + +void a64_hybrid_u8qa_dot_4x16_a55 ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg A_arg, + size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg output_arg, + const Requantize32 *qp, const int32_t *col_bias, unsigned int +) +{ + struct KernelArgs { + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const uint8_t *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + if (qp->c_offset > qp->minval) { + flags |= 0x20; + } + __asm__ __volatile__( + + "1:" // Row loop + "cmp %x[M], #0x4\n" + "bge 91f\n" + "cmp %x[M], #0x2\n" + "bgt 61f\n" + "beq 31f\n" + "movi v11.4s, #0x0\n" + "movi v15.16b, #0x1\n" + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x9, %x[col_bias]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov x28, %x[output_ptr]\n" + "2:" // Height 1: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "3:" // Height 1: setup done + "mov x27, #0x0\n" + "4:" // Height 1: String
loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 5f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "cbnz x27, 6f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "b 6f\n" + "5:" // Height 1: setup direct input + "mov x25, %x[input_ptr]\n" + "6:" // Height 1: input setup done + "cmp x26, #0x10\n" + "blt 11f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q4, [x10, #0x0]\n" + "cmp x26, #0x20\n" + "blt 9f\n" + "7:" // Height 1: Multiply loop: Main loop head + ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" + "ldr d5, [x10, #0x10]\n" + "ldr x24, [x10, #0x18]\n" + "add x25, x25, #0x10\n" + "ldr d6, [x10, #0x20]\n" + "ldr x23, [x10, #0x28]\n" + "mov v5.d[1], x24\n" + "ldr d7, [x10, #0x30]\n" + "ldr x19, [x10, #0x38]\n" + "mov v6.d[1], x23\n" + ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" + "ldr d8, [x10, #0x40]\n" + "mov v7.d[1], x19\n" + ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" + "ldr x23, [x10, #0x48]\n" + ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" + "ldr d9, [x10, #0x50]\n" + "ldr x19, [x10, #0x58]\n" + "mov v8.d[1], x23\n" + "ldr d10, [x10, #0x60]\n" + "ldr x23, [x10, #0x68]\n" + "mov v9.d[1], x19\n" + ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" + "ldr d4, [x10, #0x70]\n" + "mov v10.d[1], x23\n" + ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" + "ldr x19, [x10, #0x78]\n" + ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" + "ldr d5, [x10, #0x80]\n" + "ldr x24, [x10, #0x88]\n" + "mov v4.d[1], x19\n" + "ldr d6, [x10, #0x90]\n" + "ldr x23, [x10, #0x98]\n" + "mov v5.d[1], x24\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + "ldr d7, [x10, #0xa0]\n" + "mov v6.d[1], x23\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + "ldr x19, [x10, #0xa8]\n" + ".inst 0x6f80e8d1 // udot 
v17.4s, v6.16b, v0.4b[2]\n" + "ldr d8, [x10, #0xb0]\n" + "ldr x23, [x10, #0xb8]\n" + "mov v7.d[1], x19\n" + "ldr d9, [x10, #0xc0]\n" + "ldr x19, [x10, #0xc8]\n" + "mov v8.d[1], x23\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + "ldr d10, [x10, #0xd0]\n" + "mov v9.d[1], x19\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + "ldr x23, [x10, #0xd8]\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + "ldr d4, [x10, #0xe0]\n" + "ldr x19, [x10, #0xe8]\n" + "mov v10.d[1], x23\n" + "ldr d5, [x10, #0xf0]\n" + "ldr x24, [x10, #0xf8]\n" + "add x10, x10, #0x100\n" + "mov v4.d[1], x19\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + "mov v5.d[1], x24\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + "tbnz %x[flags], #31, 8f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + "8:" // Height 1: Multiply loop: unique 1: skip row sum + "prfm pldl1keep, [x25, #0x80]\n" + "sub x26, x26, #0x10\n" + "ldr q0, [x25, #0x0]\n" + "cmp x26, #0x20\n" + "ldr q4, [x10, #0x0]\n" + "bge 7b\n" + "9:" // Height 1: Multiply loop: Single iteration only + ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" + "ldr q5, [x10, #0x10]\n" + "ldr q6, [x10, #0x20]\n" + "sub x26, x26, #0x10\n" + "ldr q7, [x10, #0x30]\n" + "add x25, x25, #0x10\n" + ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" + "ldr q8, [x10, #0x40]\n" + ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" + "ldr q9, [x10, #0x50]\n" + ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" + "ldr q10, [x10, #0x60]\n" + ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" + "ldr q4, [x10, #0x70]\n" + "ldr q5, [x10, #0x80]\n" + ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + "ldr q8, [x10, #0xb0]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, 
v0.4b[2]\n" + "ldr q9, [x10, #0xc0]\n" + "ldr q10, [x10, #0xd0]\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + "ldr q4, [x10, #0xe0]\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + "ldr q5, [x10, #0xf0]\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + "add x10, x10, #0x100\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + "tbnz %x[flags], #31, 10f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + "10:" // Height 1: Multiply loop: unique 2: skip row sum + "prfm pldl1keep, [x25, #0x80]\n" + "11:" // Height 1: Multiply loop: Main loop skip + "cbz x26, 18f\n" + "cmp x26, #0x4\n" + "blt 14f\n" + "12:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "tbnz %x[flags], #31, 13f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + "13:" // Height 1: Multiply loop: unique 3: skip row sum + "ldr q6, [x10, #0x0]\n" + "sub x26, x26, #0x4\n" + "ldr q7, [x10, #0x10]\n" + "cmp x26, #0x4\n" + "ldr q8, [x10, #0x20]\n" + ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" + "ldr q9, [x10, #0x30]\n" + ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" + "add x10, x10, #0x40\n" + ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" + ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" + "bge 12b\n" + "cbz x26, 18f\n" + "14:" // Height 1: Multiply loop: Skip odd blocks + "tbz x26, #1, 15f\n" + "ldr h0, [x25], #0x2\n" + "tbz x26, #0, 16f\n" + "ld1 { v0.b }[2], [x25]\n" + "b 16f\n" + "15:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x25, #0x0]\n" + "16:" // Height 1: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 17f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + "17:" // Height 1: Multiply loop: unique 4: skip row sum + "ldr q10, [x10, #0x0]\n" + "ldr q4, [x10, 
#0x10]\n" + "ldr q5, [x10, #0x20]\n" + ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n" + "add x10, x10, #0x40\n" + ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n" + "18:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 4b\n" + "prfm pstl1keep, [x28, #0x0]\n" + "tbnz %x[flags], #31, 19f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "add x22, %x[qp], %[b_offset]\n" + "ld1r { v1.4s }, [x22]\n" + "neg v1.4s, v1.4s\n" + "addp v11.4s, v11.4s, v11.4s\n" + "mul v11.4s, v11.4s, v1.4s\n" + "19:" // Height 1: skip row sum fixup + "add v16.4s, v16.4s, v11.4s\n" + "add v17.4s, v17.4s, v11.4s\n" + "add v18.4s, v18.4s, v11.4s\n" + "add v19.4s, v19.4s, v11.4s\n" + "ldr q0, [x9, #0x0]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "ldr q1, [x9, #0x10]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "ldr q2, [x9, #0x20]\n" + "add x22, %x[qp], %[per_layer_mul]\n" + "add v16.4s, v16.4s, v0.4s\n" + "ldr q3, [x9, #0x30]\n" + "add v17.4s, v17.4s, v1.4s\n" + "ld1r { v0.4s }, [x23]\n" + "add v18.4s, v18.4s, v2.4s\n" + "ld1r { v4.4s }, [x22]\n" + "add v19.4s, v19.4s, v3.4s\n" + "add x9, x9, #0x40\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "tbz %x[flags], #5, 20f\n" + "and v4.16b, v16.16b, v0.16b\n" + "and v5.16b, v17.16b, v0.16b\n" + "and v6.16b, v18.16b, v0.16b\n" + "and v7.16b, v19.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "20:" // Height 1: no shift correction + "srshl v16.4s, v16.4s, v0.4s\n" + 
"srshl v17.4s, v17.4s, v0.4s\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "add x22, %x[qp], %[c_offset]\n" + "add x23, %x[qp], %[minval]\n" + "ld1r { v4.4s }, [x22]\n" + "add x22, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x23]\n" + "cmp x11, #0x10\n" + "ld1r { v6.4s }, [x22]\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "bge 29f\n" + "tbz x11, #3, 24f\n" + "str d16, [x28], #0x8\n" + "tbz x11, #2, 22f\n" + "st1 { v16.s }[2], [x28], #0x4\n" + "tbz x11, #1, 21f\n" + "st1 { v16.h }[6], [x28], #0x2\n" + "tbz x11, #0, 28f\n" + "st1 { v16.b }[14], [x28]\n" + "b 28f\n" + "21:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x11, #0, 28f\n" + "st1 { v16.b }[12], [x28]\n" + "b 28f\n" + "22:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x11, #1, 23f\n" + "st1 { v16.h }[4], [x28], #0x2\n" + "tbz x11, #0, 28f\n" + "st1 { v16.b }[10], [x28]\n" + "b 28f\n" + "23:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x11, #0, 28f\n" + "st1 { v16.b }[8], [x28]\n" + "b 28f\n" + "24:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x11, #2, 26f\n" + "str s16, [x28], #0x4\n" + "tbz x11, #1, 25f\n" + "st1 { v16.h }[2], [x28], #0x2\n" + "tbz x11, #0, 28f\n" + "st1 { v16.b }[6], [x28]\n" + "b 28f\n" + "25:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x11, #0, 28f\n" + "st1 { v16.b }[4], [x28]\n" + "b 28f\n" + "26:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x11, #1, 27f\n" + "str h16, [x28], #0x2\n" + "tbz x11, #0, 28f\n" + "st1 { v16.b }[2], 
[x28]\n" + "b 28f\n" + "27:" // Height 1: Partial direct writeback: partial_1_0 + "str b16, [x28, #0x0]\n" + "28:" // Height 1: Partial direct writeback: Done + "b 30f\n" + "29:" // Height 1: Full writeback + "str q16, [x28, #0x0]\n" + "add x28, x28, #0x10\n" + "30:" // Height 1: Writeback done + "subs x11, x11, #0x10\n" + "bgt 2b\n" + "b 122f\n" + "31:" // Height 2 + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v15.16b, #0x1\n" + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x9, %x[col_bias]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov x28, %x[output_ptr]\n" + "32:" // Height 2: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "33:" // Height 2: setup done + "mov x27, #0x0\n" + "34:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 35f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "cbnz x27, 36f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x22, x22, x19\n" + "b 36f\n" + "35:" // Height 2: setup direct input + "mov x25, %x[input_ptr]\n" + "add x22, x25, x19\n" + "36:" // Height 2: input setup done + "cmp x26, #0x10\n" + "blt 41f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + "cmp x26, #0x20\n" + "ldr q4, [x10, #0x0]\n" + "blt 39f\n" + "37:" // Height 2: Multiply loop: Main loop head + ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" + "ldr d5, [x10, #0x10]\n" + "add x25, x25, #0x10\n" + "ldr x24, [x10, #0x18]\n" + "add x22, x22, #0x10\n" + "ldr d6, [x10, #0x20]\n" + "ldr x23, [x10, 
#0x28]\n" + "mov v5.d[1], x24\n" + "ldr d7, [x10, #0x30]\n" + "ldr x19, [x10, #0x38]\n" + "mov v6.d[1], x23\n" + ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" + ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" + "ldr d8, [x10, #0x40]\n" + "mov v7.d[1], x19\n" + ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" + "ldr x23, [x10, #0x48]\n" + ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" + "ldr d9, [x10, #0x50]\n" + "mov v8.d[1], x23\n" + "ldr x19, [x10, #0x58]\n" + "ldr d10, [x10, #0x60]\n" + ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" + ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" + "mov v9.d[1], x19\n" + "ldr x23, [x10, #0x68]\n" + "ldr d4, [x10, #0x70]\n" + ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" + ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" + "mov v10.d[1], x23\n" + "ldr x19, [x10, #0x78]\n" + "ldr d5, [x10, #0x80]\n" + ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" + ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" + "mov v4.d[1], x19\n" + "ldr x24, [x10, #0x88]\n" + "ldr d6, [x10, #0x90]\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" + "mov v5.d[1], x24\n" + "ldr x23, [x10, #0x98]\n" + "ldr d7, [x10, #0xa0]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" + "mov v6.d[1], x23\n" + "ldr x19, [x10, #0xa8]\n" + "ldr d8, [x10, #0xb0]\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" + "mov v7.d[1], x19\n" + "ldr x23, [x10, #0xb8]\n" + "ldr d9, [x10, #0xc0]\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" + "mov v8.d[1], x23\n" + "ldr x19, [x10, #0xc8]\n" + "ldr d10, [x10, #0xd0]\n" + ".inst 0x6f80e913 // udot 
v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" + "mov v9.d[1], x19\n" + "ldr x23, [x10, #0xd8]\n" + "ldr d4, [x10, #0xe0]\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" + "mov v10.d[1], x23\n" + "ldr x19, [x10, #0xe8]\n" + "ldr d5, [x10, #0xf0]\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" + "mov v4.d[1], x19\n" + "ldr x24, [x10, #0xf8]\n" + "add x10, x10, #0x100\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" + "mov v5.d[1], x24\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" + "tbnz %x[flags], #31, 38f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + "38:" // Height 2: Multiply loop: unique 5: skip row sum + "prfm pldl1keep, [x25, #0x80]\n" + "sub x26, x26, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "cmp x26, #0x20\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + "ldr q4, [x10, #0x0]\n" + "bge 37b\n" + "39:" // Height 2: Multiply loop: Single iteration only + ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" + "ldr q5, [x10, #0x10]\n" + "sub x26, x26, #0x10\n" + "ldr q6, [x10, #0x20]\n" + "add x25, x25, #0x10\n" + "ldr q7, [x10, #0x30]\n" + "add x22, x22, #0x10\n" + ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" + ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" + ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" + "ldr q8, [x10, #0x40]\n" + "ldr q9, [x10, #0x50]\n" + "ldr q10, [x10, #0x60]\n" + ".inst 0x6fa0e110 // udot v16.4s, v8.16b, 
v0.4b[1]\n" + ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" + ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" + ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" + ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" + ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" + "ldr q4, [x10, #0x70]\n" + "ldr q5, [x10, #0x80]\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" + "ldr q7, [x10, #0xa0]\n" + "ldr q8, [x10, #0xb0]\n" + "ldr q9, [x10, #0xc0]\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" + "ldr q10, [x10, #0xd0]\n" + "ldr q4, [x10, #0xe0]\n" + "ldr q5, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" + "tbnz %x[flags], #31, 40f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + "40:" // Height 2: Multiply loop: unique 6: skip row sum + "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "41:" // Height 2: Multiply loop: Main loop skip + "cbz x26, 48f\n" + "cmp x26, #0x4\n" + "blt 44f\n" + "42:" // Height 2: Multiply loop: Odd block loop 
+ "ldr s0, [x25], #0x4\n" + "ldr s1, [x22], #0x4\n" + "tbnz %x[flags], #31, 43f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + "43:" // Height 2: Multiply loop: unique 7: skip row sum + "ldr q6, [x10, #0x0]\n" + "sub x26, x26, #0x4\n" + "ldr q7, [x10, #0x10]\n" + "cmp x26, #0x4\n" + "ldr q8, [x10, #0x20]\n" + ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" + ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n" + "ldr q9, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" + ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n" + "bge 42b\n" + "cbz x26, 48f\n" + "44:" // Height 2: Multiply loop: Skip odd blocks + "tbz x26, #1, 45f\n" + "ldr h0, [x25], #0x2\n" + "ldr h1, [x22], #0x2\n" + "tbz x26, #0, 46f\n" + "ld1 { v0.b }[2], [x25]\n" + "ld1 { v1.b }[2], [x22]\n" + "b 46f\n" + "45:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x25, #0x0]\n" + "ldr b1, [x22, #0x0]\n" + "46:" // Height 2: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 47f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + "47:" // Height 2: Multiply loop: unique 8: skip row sum + "ldr q10, [x10, #0x0]\n" + "ldr q4, [x10, #0x10]\n" + "ldr q5, [x10, #0x20]\n" + ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n" + ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n" + ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n" + "ldr q6, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + ".inst 
0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n" + "48:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 34b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x21, x28, x19\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbnz %x[flags], #31, 49f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "addp v12.4s, v12.4s, v12.4s\n" + "add x22, %x[qp], %[b_offset]\n" + "ld1r { v2.4s }, [x22]\n" + "neg v2.4s, v2.4s\n" + "addp v11.4s, v11.4s, v11.4s\n" + "addp v12.4s, v12.4s, v12.4s\n" + "mul v11.4s, v11.4s, v2.4s\n" + "mul v12.4s, v12.4s, v2.4s\n" + "49:" // Height 2: skip row sum fixup + "add v16.4s, v16.4s, v11.4s\n" + "add v17.4s, v17.4s, v11.4s\n" + "add v18.4s, v18.4s, v11.4s\n" + "add v19.4s, v19.4s, v11.4s\n" + "add v20.4s, v20.4s, v12.4s\n" + "add v21.4s, v21.4s, v12.4s\n" + "add v22.4s, v22.4s, v12.4s\n" + "add v23.4s, v23.4s, v12.4s\n" + "ldr q0, [x9, #0x0]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "ldr q1, [x9, #0x10]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "ldr q2, [x9, #0x20]\n" + "add x22, %x[qp], %[per_layer_mul]\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "ldr q3, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + "ld1r { v0.4s }, [x23]\n" + "ld1r { v4.4s }, [x22]\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "sqrdmulh v22.4s, v22.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "tbz %x[flags], #5, 50f\n" + "and v4.16b, v16.16b, v0.16b\n" + "and v5.16b, v17.16b, 
v0.16b\n" + "and v6.16b, v18.16b, v0.16b\n" + "and v8.16b, v20.16b, v0.16b\n" + "and v9.16b, v21.16b, v0.16b\n" + "and v10.16b, v22.16b, v0.16b\n" + "and v7.16b, v19.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sqadd v20.4s, v20.4s, v8.4s\n" + "sqadd v21.4s, v21.4s, v9.4s\n" + "sqadd v22.4s, v22.4s, v10.4s\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "and v4.16b, v23.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v4.4s\n" + "50:" // Height 2: no shift correction + "srshl v16.4s, v16.4s, v0.4s\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "srshl v21.4s, v21.4s, v0.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "add x22, %x[qp], %[c_offset]\n" + "add x23, %x[qp], %[minval]\n" + "ld1r { v4.4s }, [x22]\n" + "add x22, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x23]\n" + "cmp x11, #0x10\n" + "ld1r { v6.4s }, [x22]\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, 
v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v20.8h, v20.8h, v21.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "bge 59f\n" + "tbz x11, #3, 54f\n" + "str d16, [x28], #0x8\n" + "str d20, [x21], #0x8\n" + "tbz x11, #2, 52f\n" + "st1 { v16.s }[2], [x28], #0x4\n" + "st1 { v20.s }[2], [x21], #0x4\n" + "tbz x11, #1, 51f\n" + "st1 { v16.h }[6], [x28], #0x2\n" + "st1 { v20.h }[6], [x21], #0x2\n" + "tbz x11, #0, 58f\n" + "st1 { v16.b }[14], [x28]\n" + "st1 { v20.b }[14], [x21]\n" + "b 58f\n" + "51:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x11, #0, 58f\n" + "st1 { v16.b }[12], [x28]\n" + "st1 { v20.b }[12], [x21]\n" + "b 58f\n" + "52:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x11, #1, 53f\n" + "st1 { v16.h }[4], [x28], #0x2\n" + "st1 { v20.h }[4], [x21], #0x2\n" + "tbz x11, #0, 58f\n" + "st1 { v16.b }[10], [x28]\n" + "st1 { v20.b }[10], [x21]\n" + "b 58f\n" + "53:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x11, #0, 58f\n" + "st1 { v16.b }[8], [x28]\n" + "st1 { v20.b }[8], [x21]\n" + "b 58f\n" + "54:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x11, #2, 56f\n" + "str s16, [x28], #0x4\n" + "str s20, [x21], #0x4\n" + "tbz x11, #1, 55f\n" + "st1 { v16.h }[2], [x28], #0x2\n" + "st1 { v20.h }[2], [x21], #0x2\n" + "tbz x11, #0, 58f\n" + "st1 { v16.b }[6], [x28]\n" + "st1 { v20.b }[6], [x21]\n" + "b 58f\n" + "55:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x11, #0, 58f\n" + "st1 { v16.b }[4], [x28]\n" + "st1 { v20.b }[4], [x21]\n" + "b 58f\n" + "56:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x11, #1, 57f\n" + "str h16, [x28], #0x2\n" + "str h20, [x21], #0x2\n" + "tbz x11, #0, 58f\n" + "st1 { v16.b }[2], [x28]\n" + "st1 { v20.b }[2], [x21]\n" + "b 58f\n" + "57:" // Height 2: Partial direct writeback: partial_1_0 + "str b16, [x28, #0x0]\n" + "str 
b20, [x21, #0x0]\n" + "58:" // Height 2: Partial direct writeback: Done + "b 60f\n" + "59:" // Height 2: Full writeback + "str q16, [x28, #0x0]\n" + "add x28, x28, #0x10\n" + "str q20, [x21, #0x0]\n" + "60:" // Height 2: Writeback done + "subs x11, x11, #0x10\n" + "bgt 32b\n" + "b 122f\n" + "61:" // Height 3 + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v15.16b, #0x1\n" + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[col_bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov x28, %x[output_ptr]\n" + "62:" // Height 3: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "63:" // Height 3: setup done + "mov x27, #0x0\n" + "64:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 65f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "ldr x21, [x20, #0x10]\n" + "cbnz x27, 66f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x22, x22, x19\n" + "add x21, x21, x19\n" + "b 66f\n" + "65:" // Height 3: setup direct input + "mov x25, %x[input_ptr]\n" + "add x22, x25, x19\n" + "add x21, x22, x19\n" + "66:" // Height 3: input setup done + "cmp x26, #0x10\n" + "blt 71f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + "cmp x26, #0x20\n" + "ldr q2, [x21, #0x0]\n" + "ldr q4, [x10, #0x0]\n" + "blt 69f\n" + "67:" // Height 3: Multiply loop: Main loop head + ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6f81e094 // udot v20.4s, v4.16b, 
v1.4b[0]\n" + ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n" + "ldr d5, [x10, #0x10]\n" + "ldr x24, [x10, #0x18]\n" + "add x25, x25, #0x10\n" + "ldr d6, [x10, #0x20]\n" + "add x22, x22, #0x10\n" + "ldr x23, [x10, #0x28]\n" + "add x21, x21, #0x10\n" + "mov v5.d[1], x24\n" + "ldr d7, [x10, #0x30]\n" + "ldr x19, [x10, #0x38]\n" + "mov v6.d[1], x23\n" + ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" + ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" + ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n" + "mov v7.d[1], x19\n" + ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n" + "ldr d8, [x10, #0x40]\n" + "ldr x23, [x10, #0x48]\n" + "ldr d9, [x10, #0x50]\n" + "ldr x19, [x10, #0x58]\n" + "mov v8.d[1], x23\n" + "ldr d10, [x10, #0x60]\n" + "ldr x23, [x10, #0x68]\n" + "mov v9.d[1], x19\n" + ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" + ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" + ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n" + "mov v10.d[1], x23\n" + ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" + ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" + ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n" + ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" + ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" + ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n" + "ldr d4, [x10, #0x70]\n" + "ldr x19, [x10, #0x78]\n" + "ldr d5, [x10, #0x80]\n" + "ldr x24, [x10, #0x88]\n" + "mov v4.d[1], x19\n" + "ldr d6, [x10, #0x90]\n" + "ldr x23, [x10, #0x98]\n" + "mov v5.d[1], x24\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x6fa2e09b // udot v27.4s, v4.16b, 
v2.4b[1]\n" + "mov v6.d[1], x23\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n" + "ldr d7, [x10, #0xa0]\n" + "ldr x19, [x10, #0xa8]\n" + "ldr d8, [x10, #0xb0]\n" + "ldr x23, [x10, #0xb8]\n" + "mov v7.d[1], x19\n" + "ldr d9, [x10, #0xc0]\n" + "ldr x19, [x10, #0xc8]\n" + "mov v8.d[1], x23\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n" + "mov v9.d[1], x19\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n" + "ldr d10, [x10, #0xd0]\n" + "ldr x23, [x10, #0xd8]\n" + "ldr d4, [x10, #0xe0]\n" + "ldr x19, [x10, #0xe8]\n" + "mov v10.d[1], x23\n" + "ldr d5, [x10, #0xf0]\n" + "ldr x24, [x10, #0xf8]\n" + "add x10, x10, #0x100\n" + "mov v4.d[1], x19\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n" + "mov v5.d[1], x24\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n" + "tbnz %x[flags], #31, 68f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot 
v12.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + "68:" // Height 3: Multiply loop: unique 9: skip row sum + "prfm pldl1keep, [x25, #0x80]\n" + "sub x26, x26, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "cmp x26, #0x20\n" + "prfm pldl1keep, [x21, #0x80]\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + "ldr q2, [x21, #0x0]\n" + "ldr q4, [x10, #0x0]\n" + "bge 67b\n" + "69:" // Height 3: Multiply loop: Single iteration only + ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n" + "ldr q5, [x10, #0x10]\n" + "ldr q6, [x10, #0x20]\n" + "sub x26, x26, #0x10\n" + "ldr q7, [x10, #0x30]\n" + "add x25, x25, #0x10\n" + ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" + ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" + ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n" + ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n" + "ldr q8, [x10, #0x40]\n" + "ldr q9, [x10, #0x50]\n" + "add x22, x22, #0x10\n" + "ldr q10, [x10, #0x60]\n" + "add x21, x21, #0x10\n" + ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" + ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" + ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n" + ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" + ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" + ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n" + ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" + ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" + ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n" + "ldr q4, [x10, #0x70]\n" + "ldr q5, [x10, #0x80]\n" + "ldr q6, [x10, #0x90]\n" + 
".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n" + "ldr q7, [x10, #0xa0]\n" + "ldr q8, [x10, #0xb0]\n" + "ldr q9, [x10, #0xc0]\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n" + "ldr q10, [x10, #0xd0]\n" + "ldr q4, [x10, #0xe0]\n" + "ldr q5, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n" + "tbnz %x[flags], #31, 70f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + "70:" // Height 3: Multiply loop: unique 10: skip row sum + "prfm pldl1keep, [x25, #0x80]\n" + 
"prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "71:" // Height 3: Multiply loop: Main loop skip + "cbz x26, 78f\n" + "cmp x26, #0x4\n" + "blt 74f\n" + "72:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "ldr s1, [x22], #0x4\n" + "ldr s2, [x21], #0x4\n" + "tbnz %x[flags], #31, 73f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + "73:" // Height 3: Multiply loop: unique 11: skip row sum + "ldr q6, [x10, #0x0]\n" + "sub x26, x26, #0x4\n" + "ldr q7, [x10, #0x10]\n" + "cmp x26, #0x4\n" + "ldr q8, [x10, #0x20]\n" + ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" + ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n" + ".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n" + "ldr q9, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" + ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n" + ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n" + "bge 72b\n" + "cbz x26, 78f\n" + "74:" // Height 3: Multiply loop: Skip odd blocks + "tbz x26, #1, 75f\n" + "ldr h0, [x25], #0x2\n" + "ldr h1, [x22], #0x2\n" + "ldr h2, [x21], #0x2\n" + "tbz x26, #0, 76f\n" + "ld1 { v0.b }[2], [x25]\n" + "ld1 { v1.b }[2], [x22]\n" + "ld1 { v2.b }[2], [x21]\n" + "b 76f\n" + "75:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x25, #0x0]\n" + "ldr b1, [x22, #0x0]\n" + "ldr b2, [x21, #0x0]\n" + "76:" // Height 3: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 77f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, 
v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + "77:" // Height 3: Multiply loop: unique 12: skip row sum + "ldr q10, [x10, #0x0]\n" + "ldr q4, [x10, #0x10]\n" + "ldr q5, [x10, #0x20]\n" + ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n" + ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n" + ".inst 0x6f82e158 // udot v24.4s, v10.16b, v2.4b[0]\n" + ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6f82e099 // udot v25.4s, v4.16b, v2.4b[0]\n" + ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n" + ".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n" + "ldr q6, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0db // udot v27.4s, v6.16b, v2.4b[0]\n" + "78:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 64b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x21, x28, x19\n" + "prfm pstl1keep, [x21, #0x0]\n" + "add x20, x21, x19\n" + "prfm pstl1keep, [x20, #0x0]\n" + "tbnz %x[flags], #31, 79f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "addp v12.4s, v12.4s, v12.4s\n" + "addp v13.4s, v13.4s, v13.4s\n" + "add x22, %x[qp], %[b_offset]\n" + "ld1r { v3.4s }, [x22]\n" + "neg v3.4s, v3.4s\n" + "addp v11.4s, v11.4s, v11.4s\n" + "addp v12.4s, v12.4s, v12.4s\n" + "addp v13.4s, v13.4s, v13.4s\n" + "mul v11.4s, v11.4s, v3.4s\n" + "mul v12.4s, v12.4s, v3.4s\n" + "mul v13.4s, v13.4s, v3.4s\n" + "79:" // Height 3: skip row sum fixup + "add v16.4s, v16.4s, v11.4s\n" + "add v17.4s, v17.4s, v11.4s\n" + "add v18.4s, v18.4s, v11.4s\n" + "add v19.4s, v19.4s, v11.4s\n" + "add v20.4s, v20.4s, v12.4s\n" + "add 
v21.4s, v21.4s, v12.4s\n" + "add v22.4s, v22.4s, v12.4s\n" + "add v23.4s, v23.4s, v12.4s\n" + "add v24.4s, v24.4s, v13.4s\n" + "add v25.4s, v25.4s, v13.4s\n" + "add v26.4s, v26.4s, v13.4s\n" + "add v27.4s, v27.4s, v13.4s\n" + "ldr q0, [x9, #0x0]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "ldr q1, [x9, #0x10]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "ldr q2, [x9, #0x20]\n" + "add x22, %x[qp], %[per_layer_mul]\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "ldr q3, [x9, #0x30]\n" + "ld1r { v0.4s }, [x23]\n" + "add x9, x9, #0x40\n" + "ld1r { v4.4s }, [x22]\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "sqrdmulh v22.4s, v22.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "sqrdmulh v24.4s, v24.4s, v4.4s\n" + "sqrdmulh v25.4s, v25.4s, v4.4s\n" + "sqrdmulh v26.4s, v26.4s, v4.4s\n" + "sqrdmulh v27.4s, v27.4s, v4.4s\n" + "tbz %x[flags], #5, 80f\n" + "and v4.16b, v16.16b, v0.16b\n" + "and v5.16b, v17.16b, v0.16b\n" + "and v6.16b, v18.16b, v0.16b\n" + "and v7.16b, v19.16b, v0.16b\n" + "and v8.16b, v20.16b, v0.16b\n" + "and v9.16b, v21.16b, v0.16b\n" + "and v10.16b, v22.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "and v4.16b, v23.16b, v0.16b\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sqadd 
v19.4s, v19.4s, v7.4s\n" + "sqadd v20.4s, v20.4s, v8.4s\n" + "sqadd v21.4s, v21.4s, v9.4s\n" + "sqadd v22.4s, v22.4s, v10.4s\n" + "and v5.16b, v24.16b, v0.16b\n" + "and v6.16b, v25.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v7.16b, v26.16b, v0.16b\n" + "and v8.16b, v27.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v4.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "sqadd v24.4s, v24.4s, v5.4s\n" + "sqadd v25.4s, v25.4s, v6.4s\n" + "sqadd v26.4s, v26.4s, v7.4s\n" + "sqadd v27.4s, v27.4s, v8.4s\n" + "80:" // Height 3: no shift correction + "srshl v16.4s, v16.4s, v0.4s\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "srshl v21.4s, v21.4s, v0.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" + "srshl v25.4s, v25.4s, v0.4s\n" + "srshl v26.4s, v26.4s, v0.4s\n" + "srshl v27.4s, v27.4s, v0.4s\n" + "add x22, %x[qp], %[c_offset]\n" + "add x23, %x[qp], %[minval]\n" + "ld1r { v4.4s }, [x22]\n" + "add x22, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x23]\n" + "cmp x11, #0x10\n" + "ld1r { v6.4s }, [x22]\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + 
"smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v20.8h, v20.8h, v21.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v24.8h, v24.8h, v25.8h\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 89f\n" + "tbz x11, #3, 84f\n" + "str d16, [x28], #0x8\n" + "str d20, [x21], #0x8\n" + "str d24, [x20], #0x8\n" + "tbz x11, #2, 82f\n" + "st1 { v16.s }[2], [x28], #0x4\n" + "st1 { v20.s }[2], [x21], #0x4\n" + "st1 { v24.s }[2], [x20], #0x4\n" + "tbz x11, #1, 81f\n" + "st1 { v16.h }[6], [x28], #0x2\n" + "st1 { v20.h }[6], [x21], #0x2\n" + "st1 { v24.h }[6], [x20], #0x2\n" + "tbz x11, #0, 88f\n" + "st1 { v16.b }[14], [x28]\n" + "st1 { v20.b }[14], [x21]\n" + "st1 { v24.b }[14], [x20]\n" + "b 88f\n" + "81:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x11, #0, 88f\n" + "st1 { v16.b }[12], [x28]\n" + "st1 { v20.b }[12], [x21]\n" + "st1 { v24.b }[12], [x20]\n" + "b 88f\n" + "82:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x11, #1, 83f\n" + "st1 { v16.h }[4], [x28], #0x2\n" + "st1 { v20.h }[4], [x21], #0x2\n" + "st1 { v24.h }[4], [x20], #0x2\n" + "tbz x11, #0, 88f\n" + "st1 { v16.b }[10], [x28]\n" + "st1 { v20.b }[10], [x21]\n" + "st1 { v24.b }[10], [x20]\n" + "b 88f\n" + "83:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x11, #0, 88f\n" + "st1 { v16.b }[8], [x28]\n" + "st1 { v20.b }[8], [x21]\n" + "st1 { v24.b }[8], [x20]\n" + "b 88f\n" + "84:" // Height 3: Partial direct writeback: partial_4_0 + 
"tbz x11, #2, 86f\n" + "str s16, [x28], #0x4\n" + "str s20, [x21], #0x4\n" + "str s24, [x20], #0x4\n" + "tbz x11, #1, 85f\n" + "st1 { v16.h }[2], [x28], #0x2\n" + "st1 { v20.h }[2], [x21], #0x2\n" + "st1 { v24.h }[2], [x20], #0x2\n" + "tbz x11, #0, 88f\n" + "st1 { v16.b }[6], [x28]\n" + "st1 { v20.b }[6], [x21]\n" + "st1 { v24.b }[6], [x20]\n" + "b 88f\n" + "85:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x11, #0, 88f\n" + "st1 { v16.b }[4], [x28]\n" + "st1 { v20.b }[4], [x21]\n" + "st1 { v24.b }[4], [x20]\n" + "b 88f\n" + "86:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x11, #1, 87f\n" + "str h16, [x28], #0x2\n" + "str h20, [x21], #0x2\n" + "str h24, [x20], #0x2\n" + "tbz x11, #0, 88f\n" + "st1 { v16.b }[2], [x28]\n" + "st1 { v20.b }[2], [x21]\n" + "st1 { v24.b }[2], [x20]\n" + "b 88f\n" + "87:" // Height 3: Partial direct writeback: partial_1_0 + "str b16, [x28, #0x0]\n" + "str b20, [x21, #0x0]\n" + "str b24, [x20, #0x0]\n" + "88:" // Height 3: Partial direct writeback: Done + "b 90f\n" + "89:" // Height 3: Full writeback + "str q16, [x28, #0x0]\n" + "add x28, x28, #0x10\n" + "str q20, [x21, #0x0]\n" + "str q24, [x20, #0x0]\n" + "90:" // Height 3: Writeback done + "subs x11, x11, #0x10\n" + "bgt 62b\n" + "b 122f\n" + "91:" // Height 4 + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.16b, #0x1\n" + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x9, %x[col_bias]\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov x28, %x[output_ptr]\n" + "mov x19, #0x4\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "92:" // Height 4: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, 
#0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "93:" // Height 4: setup done + "mov x27, #0x0\n" + "94:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 95f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x25, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "ldr x21, [x20, #0x10]\n" + "ldr x20, [x20, #0x18]\n" + "cbnz x27, 96f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x25, x25, x19\n" + "add x22, x22, x19\n" + "add x21, x21, x19\n" + "add x20, x20, x19\n" + "b 96f\n" + "95:" // Height 4: setup direct input + "mov x25, %x[input_ptr]\n" + "add x22, x25, x19\n" + "add x21, x22, x19\n" + "add x20, x21, x19\n" + "96:" // Height 4: input setup done + "cmp x26, #0x10\n" + "blt 101f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + "cmp x26, #0x20\n" + "ldr q2, [x21, #0x0]\n" + "ldr q3, [x20, #0x0]\n" + "ldr q4, [x10, #0x0]\n" + "blt 99f\n" + "97:" // Height 4: Multiply loop: Main loop head + ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n" + ".inst 0x6f83e09c // udot v28.4s, v4.16b, v3.4b[0]\n" + "ldr d5, [x10, #0x10]\n" + "add x25, x25, #0x10\n" + "ldr x24, [x10, #0x18]\n" + "add x22, x22, #0x10\n" + "ldr d6, [x10, #0x20]\n" + "add x21, x21, #0x10\n" + "ldr x23, [x10, #0x28]\n" + "add x20, x20, #0x10\n" + "mov v5.d[1], x24\n" + "ldr d7, [x10, #0x30]\n" + "ldr x19, [x10, #0x38]\n" + "mov v6.d[1], x23\n" + ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" + ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" + ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n" + "mov v7.d[1], x19\n" + ".inst 0x6f83e0bd // udot v29.4s, v5.16b, 
v3.4b[0]\n" + ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0de // udot v30.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0ff // udot v31.4s, v7.16b, v3.4b[0]\n" + "ldr d8, [x10, #0x40]\n" + "ldr x23, [x10, #0x48]\n" + "ldr d9, [x10, #0x50]\n" + "ldr x19, [x10, #0x58]\n" + "mov v8.d[1], x23\n" + "ldr d10, [x10, #0x60]\n" + "ldr x23, [x10, #0x68]\n" + "mov v9.d[1], x19\n" + ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" + ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" + ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n" + "mov v10.d[1], x23\n" + ".inst 0x6fa3e11c // udot v28.4s, v8.16b, v3.4b[1]\n" + ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" + ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" + ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n" + ".inst 0x6fa3e13d // udot v29.4s, v9.16b, v3.4b[1]\n" + ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" + ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" + ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n" + ".inst 0x6fa3e15e // udot v30.4s, v10.16b, v3.4b[1]\n" + "ldr d4, [x10, #0x70]\n" + "ldr x19, [x10, #0x78]\n" + "ldr d5, [x10, #0x80]\n" + "ldr x24, [x10, #0x88]\n" + "mov v4.d[1], x19\n" + "ldr d6, [x10, #0x90]\n" + "ldr x23, [x10, #0x98]\n" + "mov v5.d[1], x24\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n" + "mov v6.d[1], x23\n" + ".inst 0x6fa3e09f // udot v31.4s, v4.16b, v3.4b[1]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n" + 
".inst 0x6f83e8bc // udot v28.4s, v5.16b, v3.4b[2]\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8dd // udot v29.4s, v6.16b, v3.4b[2]\n" + "ldr d7, [x10, #0xa0]\n" + "ldr x19, [x10, #0xa8]\n" + "ldr d8, [x10, #0xb0]\n" + "ldr x23, [x10, #0xb8]\n" + "mov v7.d[1], x19\n" + "ldr d9, [x10, #0xc0]\n" + "ldr x19, [x10, #0xc8]\n" + "mov v8.d[1], x23\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n" + "mov v9.d[1], x19\n" + ".inst 0x6f83e8fe // udot v30.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 0x6f83e91f // udot v31.4s, v8.16b, v3.4b[2]\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n" + ".inst 0x6fa3e93c // udot v28.4s, v9.16b, v3.4b[3]\n" + "ldr d10, [x10, #0xd0]\n" + "ldr x23, [x10, #0xd8]\n" + "ldr d4, [x10, #0xe0]\n" + "ldr x19, [x10, #0xe8]\n" + "mov v10.d[1], x23\n" + "ldr d5, [x10, #0xf0]\n" + "ldr x24, [x10, #0xf8]\n" + "add x10, x10, #0x100\n" + "mov v4.d[1], x19\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n" + "mov v5.d[1], x24\n" + ".inst 0x6fa3e95d // udot v29.4s, v10.16b, v3.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x6fa3e89e // udot v30.4s, v4.16b, v3.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, 
v1.4b[3]\n" + ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n" + ".inst 0x6fa3e8bf // udot v31.4s, v5.16b, v3.4b[3]\n" + "tbnz %x[flags], #31, 98f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" + "98:" // Height 4: Multiply loop: unique 13: skip row sum + "prfm pldl1keep, [x25, #0x80]\n" + "sub x26, x26, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "cmp x26, #0x20\n" + "prfm pldl1keep, [x21, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + "ldr q2, [x21, #0x0]\n" + "ldr q3, [x20, #0x0]\n" + "ldr q4, [x10, #0x0]\n" + "bge 97b\n" + "99:" // Height 4: Multiply loop: Single iteration only + ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n" + ".inst 0x6f83e09c // udot v28.4s, v4.16b, v3.4b[0]\n" + "ldr q5, [x10, #0x10]\n" + "sub x26, x26, #0x10\n" + "ldr q6, [x10, #0x20]\n" + "add x25, x25, #0x10\n" + "ldr q7, [x10, #0x30]\n" + "add x22, x22, #0x10\n" + ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" + ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" + ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n" + ".inst 0x6f83e0bd // udot v29.4s, v5.16b, v3.4b[0]\n" + ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0de // udot v30.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0ff // udot v31.4s, v7.16b, v3.4b[0]\n" + "ldr q8, [x10, #0x40]\n" + "add x21, x21, #0x10\n" + "ldr q9, [x10, #0x50]\n" + "add x20, x20, #0x10\n" + "ldr q10, 
[x10, #0x60]\n" + ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" + ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" + ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n" + ".inst 0x6fa3e11c // udot v28.4s, v8.16b, v3.4b[1]\n" + ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" + ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" + ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n" + ".inst 0x6fa3e13d // udot v29.4s, v9.16b, v3.4b[1]\n" + ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" + ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" + ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n" + ".inst 0x6fa3e15e // udot v30.4s, v10.16b, v3.4b[1]\n" + "ldr q4, [x10, #0x70]\n" + "ldr q5, [x10, #0x80]\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n" + ".inst 0x6fa3e09f // udot v31.4s, v4.16b, v3.4b[1]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n" + ".inst 0x6f83e8bc // udot v28.4s, v5.16b, v3.4b[2]\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8dd // udot v29.4s, v6.16b, v3.4b[2]\n" + "ldr q7, [x10, #0xa0]\n" + "ldr q8, [x10, #0xb0]\n" + "ldr q9, [x10, #0xc0]\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8fe // udot v30.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 0x6f83e91f // udot v31.4s, v8.16b, v3.4b[2]\n" + ".inst 0x6fa0e930 // 
udot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n" + ".inst 0x6fa3e93c // udot v28.4s, v9.16b, v3.4b[3]\n" + "ldr q10, [x10, #0xd0]\n" + "ldr q4, [x10, #0xe0]\n" + "ldr q5, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n" + ".inst 0x6fa3e95d // udot v29.4s, v10.16b, v3.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x6fa3e89e // udot v30.4s, v4.16b, v3.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n" + ".inst 0x6fa3e8bf // udot v31.4s, v5.16b, v3.4b[3]\n" + "tbnz %x[flags], #31, 100f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" + "100:" // Height 4: Multiply loop: unique 14: skip row sum + "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "101:" // Height 4: Multiply loop: Main loop skip + "cbz x26, 108f\n" + "cmp x26, #0x4\n" + "blt 104f\n" + "102:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "ldr s1, [x22], #0x4\n" + "ldr s2, [x21], #0x4\n" + "ldr s3, [x20], #0x4\n" + "tbnz %x[flags], #31, 103f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" + "103:" // Height 4: Multiply loop: unique 
15: skip row sum + "ldr q6, [x10, #0x0]\n" + "sub x26, x26, #0x4\n" + "ldr q7, [x10, #0x10]\n" + "cmp x26, #0x4\n" + "ldr q8, [x10, #0x20]\n" + ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0dc // udot v28.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0fd // udot v29.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" + ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n" + ".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n" + ".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n" + "ldr q9, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" + ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n" + ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n" + ".inst 0x6f83e13f // udot v31.4s, v9.16b, v3.4b[0]\n" + "bge 102b\n" + "cbz x26, 108f\n" + "104:" // Height 4: Multiply loop: Skip odd blocks + "tbz x26, #1, 105f\n" + "ldr h0, [x25], #0x2\n" + "ldr h1, [x22], #0x2\n" + "ldr h2, [x21], #0x2\n" + "ldr h3, [x20], #0x2\n" + "tbz x26, #0, 106f\n" + "ld1 { v0.b }[2], [x25]\n" + "ld1 { v1.b }[2], [x22]\n" + "ld1 { v2.b }[2], [x21]\n" + "ld1 { v3.b }[2], [x20]\n" + "b 106f\n" + "105:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x25, #0x0]\n" + "ldr b1, [x22, #0x0]\n" + "ldr b2, [x21, #0x0]\n" + "ldr b3, [x20, #0x0]\n" + "106:" // Height 4: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 107f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" + "107:" // Height 4: Multiply loop: unique 16: 
skip row sum + "ldr q10, [x10, #0x0]\n" + "ldr q4, [x10, #0x10]\n" + "ldr q5, [x10, #0x20]\n" + ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n" + ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n" + ".inst 0x6f82e158 // udot v24.4s, v10.16b, v2.4b[0]\n" + ".inst 0x6f83e15c // udot v28.4s, v10.16b, v3.4b[0]\n" + ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6f82e099 // udot v25.4s, v4.16b, v2.4b[0]\n" + ".inst 0x6f83e09d // udot v29.4s, v4.16b, v3.4b[0]\n" + ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n" + ".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n" + ".inst 0x6f83e0be // udot v30.4s, v5.16b, v3.4b[0]\n" + "ldr q6, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0db // udot v27.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0df // udot v31.4s, v6.16b, v3.4b[0]\n" + "108:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 94b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "add x21, x28, x19\n" + "prfm pstl1keep, [x21, #0x0]\n" + "add x20, x21, x19\n" + "prfm pstl1keep, [x20, #0x0]\n" + "add x19, x20, x19\n" + "prfm pstl1keep, [x19, #0x0]\n" + "tbnz %x[flags], #31, 109f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "addp v12.4s, v12.4s, v12.4s\n" + "addp v13.4s, v13.4s, v13.4s\n" + "addp v14.4s, v14.4s, v14.4s\n" + "add x22, %x[qp], %[b_offset]\n" + "ld1r { v4.4s }, [x22]\n" + "addp v11.4s, v11.4s, v11.4s\n" + "addp v12.4s, v12.4s, v12.4s\n" + "neg v4.4s, v4.4s\n" + "addp v13.4s, v13.4s, v13.4s\n" + "addp v14.4s, v14.4s, v14.4s\n" + "mul v11.4s, v11.4s, v4.4s\n" + "mul v12.4s, v12.4s, v4.4s\n" + "mul v13.4s, v13.4s, v4.4s\n" + "mul v14.4s, v14.4s, v4.4s\n" + 
"109:" // Height 4: skip row sum fixup + "add v16.4s, v16.4s, v11.4s\n" + "add v17.4s, v17.4s, v11.4s\n" + "add v18.4s, v18.4s, v11.4s\n" + "add v19.4s, v19.4s, v11.4s\n" + "add v20.4s, v20.4s, v12.4s\n" + "add v21.4s, v21.4s, v12.4s\n" + "add v22.4s, v22.4s, v12.4s\n" + "add v23.4s, v23.4s, v12.4s\n" + "add v24.4s, v24.4s, v13.4s\n" + "add v25.4s, v25.4s, v13.4s\n" + "add v26.4s, v26.4s, v13.4s\n" + "add v27.4s, v27.4s, v13.4s\n" + "add v28.4s, v28.4s, v14.4s\n" + "add v29.4s, v29.4s, v14.4s\n" + "add v30.4s, v30.4s, v14.4s\n" + "add v31.4s, v31.4s, v14.4s\n" + "ldr q0, [x9, #0x0]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "ldr q1, [x9, #0x10]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" + "ldr q2, [x9, #0x20]\n" + "add x22, %x[qp], %[per_layer_mul]\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v28.4s, v28.4s, v0.4s\n" + "add v29.4s, v29.4s, v1.4s\n" + "add v30.4s, v30.4s, v2.4s\n" + "ldr q3, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + "ld1r { v0.4s }, [x23]\n" + "ld1r { v4.4s }, [x22]\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "add v31.4s, v31.4s, v3.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "sqrdmulh v22.4s, v22.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "sqrdmulh v24.4s, v24.4s, v4.4s\n" + "sqrdmulh v25.4s, v25.4s, v4.4s\n" + "sqrdmulh v26.4s, v26.4s, v4.4s\n" + "sqrdmulh v27.4s, v27.4s, v4.4s\n" + "sqrdmulh v28.4s, v28.4s, v4.4s\n" + "sqrdmulh v29.4s, v29.4s, v4.4s\n" + "sqrdmulh v30.4s, v30.4s, v4.4s\n" + "sqrdmulh v31.4s, v31.4s, v4.4s\n" + "tbz 
%x[flags], #5, 110f\n" + "and v4.16b, v16.16b, v0.16b\n" + "and v5.16b, v17.16b, v0.16b\n" + "and v6.16b, v18.16b, v0.16b\n" + "and v7.16b, v19.16b, v0.16b\n" + "and v8.16b, v20.16b, v0.16b\n" + "and v9.16b, v21.16b, v0.16b\n" + "and v10.16b, v22.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "and v4.16b, v23.16b, v0.16b\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "sqadd v20.4s, v20.4s, v8.4s\n" + "sqadd v21.4s, v21.4s, v9.4s\n" + "sqadd v22.4s, v22.4s, v10.4s\n" + "and v5.16b, v24.16b, v0.16b\n" + "and v6.16b, v25.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v7.16b, v26.16b, v0.16b\n" + "and v8.16b, v27.16b, v0.16b\n" + "and v9.16b, v28.16b, v0.16b\n" + "and v10.16b, v29.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v4.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "and v4.16b, v30.16b, v0.16b\n" + "sqadd v24.4s, v24.4s, v5.4s\n" + "sqadd v25.4s, v25.4s, v6.4s\n" + "sqadd v26.4s, v26.4s, v7.4s\n" + "and v5.16b, v31.16b, v0.16b\n" + "sqadd v27.4s, v27.4s, v8.4s\n" + "sqadd v28.4s, v28.4s, v9.4s\n" + "sqadd v29.4s, v29.4s, v10.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v30.4s, v30.4s, v4.4s\n" + "sqadd v31.4s, v31.4s, v5.4s\n" + "110:" // Height 4: no shift correction + "srshl v16.4s, v16.4s, v0.4s\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "srshl v21.4s, v21.4s, v0.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" + "srshl v25.4s, v25.4s, v0.4s\n" + 
"srshl v26.4s, v26.4s, v0.4s\n" + "srshl v27.4s, v27.4s, v0.4s\n" + "srshl v28.4s, v28.4s, v0.4s\n" + "srshl v29.4s, v29.4s, v0.4s\n" + "srshl v30.4s, v30.4s, v0.4s\n" + "srshl v31.4s, v31.4s, v0.4s\n" + "add x22, %x[qp], %[c_offset]\n" + "add x23, %x[qp], %[minval]\n" + "ld1r { v4.4s }, [x22]\n" + "add x22, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x23]\n" + "cmp x11, #0x10\n" + "ld1r { v6.4s }, [x22]\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "add v28.4s, v28.4s, v4.4s\n" + "add v29.4s, v29.4s, v4.4s\n" + "add v30.4s, v30.4s, v4.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v20.8h, v20.8h, v21.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "smin v28.4s, v28.4s, v6.4s\n" + "smin v29.4s, v29.4s, v6.4s\n" + "smin v30.4s, v30.4s, v6.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" + "uzp1 v24.8h, v24.8h, v25.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 
v20.16b, v20.16b, v21.16b\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" + "smax v28.4s, v28.4s, v5.4s\n" + "smax v29.4s, v29.4s, v5.4s\n" + "smax v30.4s, v30.4s, v5.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v28.8h, v28.8h, v29.8h\n" + "uzp1 v29.8h, v30.8h, v31.8h\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "uzp1 v28.16b, v28.16b, v29.16b\n" + "bge 119f\n" + "tbz x11, #3, 114f\n" + "str d16, [x28], #0x8\n" + "str d20, [x21], #0x8\n" + "str d24, [x20], #0x8\n" + "str d28, [x19], #0x8\n" + "tbz x11, #2, 112f\n" + "st1 { v16.s }[2], [x28], #0x4\n" + "st1 { v20.s }[2], [x21], #0x4\n" + "st1 { v24.s }[2], [x20], #0x4\n" + "st1 { v28.s }[2], [x19], #0x4\n" + "tbz x11, #1, 111f\n" + "st1 { v16.h }[6], [x28], #0x2\n" + "st1 { v20.h }[6], [x21], #0x2\n" + "st1 { v24.h }[6], [x20], #0x2\n" + "st1 { v28.h }[6], [x19], #0x2\n" + "tbz x11, #0, 118f\n" + "st1 { v16.b }[14], [x28]\n" + "st1 { v20.b }[14], [x21]\n" + "st1 { v24.b }[14], [x20]\n" + "st1 { v28.b }[14], [x19]\n" + "b 118f\n" + "111:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x11, #0, 118f\n" + "st1 { v16.b }[12], [x28]\n" + "st1 { v20.b }[12], [x21]\n" + "st1 { v24.b }[12], [x20]\n" + "st1 { v28.b }[12], [x19]\n" + "b 118f\n" + "112:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x11, #1, 113f\n" + "st1 { v16.h }[4], [x28], #0x2\n" + "st1 { v20.h }[4], [x21], #0x2\n" + "st1 { v24.h }[4], [x20], #0x2\n" + "st1 { v28.h }[4], [x19], #0x2\n" + "tbz x11, #0, 118f\n" + "st1 { v16.b }[10], [x28]\n" + "st1 { v20.b }[10], [x21]\n" + "st1 { v24.b }[10], [x20]\n" + "st1 { v28.b }[10], [x19]\n" + "b 118f\n" + "113:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x11, #0, 118f\n" + "st1 { v16.b }[8], [x28]\n" + "st1 { v20.b }[8], [x21]\n" + "st1 { v24.b }[8], [x20]\n" + "st1 { v28.b }[8], [x19]\n" + "b 118f\n" + "114:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x11, #2, 116f\n" + "str s16, [x28], #0x4\n" + "str s20, 
[x21], #0x4\n" + "str s24, [x20], #0x4\n" + "str s28, [x19], #0x4\n" + "tbz x11, #1, 115f\n" + "st1 { v16.h }[2], [x28], #0x2\n" + "st1 { v20.h }[2], [x21], #0x2\n" + "st1 { v24.h }[2], [x20], #0x2\n" + "st1 { v28.h }[2], [x19], #0x2\n" + "tbz x11, #0, 118f\n" + "st1 { v16.b }[6], [x28]\n" + "st1 { v20.b }[6], [x21]\n" + "st1 { v24.b }[6], [x20]\n" + "st1 { v28.b }[6], [x19]\n" + "b 118f\n" + "115:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x11, #0, 118f\n" + "st1 { v16.b }[4], [x28]\n" + "st1 { v20.b }[4], [x21]\n" + "st1 { v24.b }[4], [x20]\n" + "st1 { v28.b }[4], [x19]\n" + "b 118f\n" + "116:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x11, #1, 117f\n" + "str h16, [x28], #0x2\n" + "str h20, [x21], #0x2\n" + "str h24, [x20], #0x2\n" + "str h28, [x19], #0x2\n" + "tbz x11, #0, 118f\n" + "st1 { v16.b }[2], [x28]\n" + "st1 { v20.b }[2], [x21]\n" + "st1 { v24.b }[2], [x20]\n" + "st1 { v28.b }[2], [x19]\n" + "b 118f\n" + "117:" // Height 4: Partial direct writeback: partial_1_0 + "str b16, [x28, #0x0]\n" + "str b20, [x21, #0x0]\n" + "str b24, [x20, #0x0]\n" + "str b28, [x19, #0x0]\n" + "118:" // Height 4: Partial direct writeback: Done + "b 120f\n" + "119:" // Height 4: Full writeback + "str q16, [x28, #0x0]\n" + "add x28, x28, #0x10\n" + "str q20, [x21, #0x0]\n" + "str q24, [x20, #0x0]\n" + "str q28, [x19, #0x0]\n" + "120:" // Height 4: Writeback done + "subs x11, x11, #0x10\n" + "bgt 92b\n" + "subs %x[M], %x[M], #0x4\n" + "beq 122f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 121f\n" + "add x20, x20, #0x4\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "121:" // Update direct input + "mov x19, #0x4\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "122:" // Exit + + : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" 
(offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp index 2b80285f57..6e85eec204 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp @@ -81,216 +81,207 @@ void a64_hybrid_u8qa_dot_4x16 ( "1:" // Row loop "cmp %x[M], #0x4\n" - "bge 94f\n" + "bge 91f\n" "cmp %x[M], #0x2\n" - "bgt 63f\n" - "beq 32f\n" + "bgt 61f\n" + "beq 31f\n" "movi v11.4s, #0x0\n" - "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" - "movi v12.4s, #0x0\n" - "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "mov x10, %x[col_bias]\n" - "movi v13.4s, #0x0\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "bic %x[flags], %x[flags], #0x80000000\n" - "movi 
v14.4s, #0x0\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" "movi v15.16b, #0x1\n" - "tbz %x[flags], #2, 2f\n" - "ldr x9, [%x[output_ptr], #0x0]\n" - "add x9, x9, x19\n" - "b 3f\n" - "2:" // Height 1: setup direct output - "mov x9, %x[output_ptr]\n" - "3:" // Height 1: Column loop + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x27, %x[col_bias]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov x26, %x[output_ptr]\n" + "2:" // Height 1: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" "movi v19.4s, #0x0\n" - "4:" // Height 1: setup done - "mov x28, #0x0\n" - "5:" // Height 1: String loop + "3:" // Height 1: setup done + "mov x25, #0x0\n" + "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 6f\n" - "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 5f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x26, [x20, #0x0]\n" - "cbnz x28, 7f\n" + "ldr x23, [x20, #0x0]\n" + "cbnz x25, 6f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x19\n" - "b 7f\n" - "6:" // Height 1: setup direct input - "mov x26, %x[input_ptr]\n" - "7:" // Height 1: input setup done - "cmp x27, #0x10\n" - "blt 12f\n" - "cmp x27, #0x20\n" - "blt 10f\n" - "8:" // Height 1: Multiply loop: Main loop head - "ldr q0, [x26, #0x0]\n" - "ldr q4, [x11, #0x0]\n" + "add x23, x23, x19\n" + "b 6f\n" + "5:" // Height 1: setup direct input + "mov x23, %x[input_ptr]\n" + "6:" // Height 1: input setup done + "cmp x24, #0x10\n" + "blt 11f\n" + "ldr q0, [x23, #0x0]\n" + "ldr q4, [x28, #0x0]\n" + "cmp x24, #0x20\n" + "blt 9f\n" + "7:" // Height 1: Multiply loop: Main loop head ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" - "ldr q5, [x11, #0x10]\n" - "ldr q6, [x11, #0x20]\n" + "ldr q5, [x28, #0x10]\n" + "add x23, 
x23, #0x10\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" - "ldr q7, [x11, #0x30]\n" + "ldr q6, [x28, #0x20]\n" + "ldr q7, [x28, #0x30]\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" - "ldr q8, [x11, #0x40]\n" - "ldr q9, [x11, #0x50]\n" + "ldr q8, [x28, #0x40]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" - "ldr q10, [x11, #0x60]\n" - "ldr q4, [x11, #0x70]\n" + "ldr q9, [x28, #0x50]\n" + "ldr q10, [x28, #0x60]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" + "ldr q4, [x28, #0x70]\n" + "ldr q5, [x28, #0x80]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" - "ldr q5, [x11, #0x80]\n" - "ldr q6, [x11, #0x90]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" - "ldr q7, [x11, #0xa0]\n" + "ldr q6, [x28, #0x90]\n" + "ldr q7, [x28, #0xa0]\n" ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" - "ldr q8, [x11, #0xb0]\n" - "ldr q9, [x11, #0xc0]\n" + "ldr q8, [x28, #0xb0]\n" ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + "ldr q9, [x28, #0xc0]\n" + "ldr q10, [x28, #0xd0]\n" ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" - "ldr q10, [x11, #0xd0]\n" - "ldr q4, [x11, #0xe0]\n" ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" - "ldr q5, [x11, #0xf0]\n" - "add x26, x26, #0x10\n" + "ldr q4, [x28, #0xe0]\n" + "ldr q5, [x28, #0xf0]\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + "add x28, x28, #0x100\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + "tbnz %x[flags], #31, 8f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + "8:" // Height 1: Multiply loop: unique 1: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "sub x24, x24, #0x10\n" + "ldr q0, [x23, #0x0]\n" + "cmp x24, #0x20\n" + "ldr q4, [x28, #0x0]\n" + "bge 7b\n" + "9:" // Height 1: Multiply loop: Single iteration only + ".inst 0x6f80e090 // udot 
v16.4s, v4.16b, v0.4b[0]\n" + "ldr q5, [x28, #0x10]\n" + "sub x24, x24, #0x10\n" + ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" + "ldr q6, [x28, #0x20]\n" + "add x23, x23, #0x10\n" + ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x28, #0x30]\n" + "ldr q8, [x28, #0x40]\n" + ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" + "ldr q9, [x28, #0x50]\n" + ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" + "ldr q10, [x28, #0x60]\n" + "ldr q4, [x28, #0x70]\n" + ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" + "ldr q5, [x28, #0x80]\n" + "ldr q6, [x28, #0x90]\n" + ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + "ldr q7, [x28, #0xa0]\n" + "ldr q8, [x28, #0xb0]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + "ldr q9, [x28, #0xc0]\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + "ldr q10, [x28, #0xd0]\n" + "ldr q4, [x28, #0xe0]\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + "ldr q5, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" - "add x11, x11, #0x100\n" ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" - "tbnz %x[flags], #31, 9f\n" + "tbnz %x[flags], #31, 10f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" - "9:" // Height 1: Multiply loop: unique 1: skip row sum - "prfm pldl1keep, [x26, #0x80]\n" - "sub x27, x27, #0x10\n" - "cmp x27, #0x20\n" - "bge 8b\n" - "10:" // Height 1: Multiply loop: Single iteration only - "sub x27, x27, #0x10\n" - "ldr q0, [x26, #0x0]\n" - "ldr q6, [x11, #0x0]\n" + "10:" // Height 1: Multiply loop: unique 2: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "11:" // Height 1: Multiply loop: Main loop skip + "cbz x24, 18f\n" + "cmp x24, #0x4\n" + "blt 14f\n" + "12:" // Height 1: 
Multiply loop: Odd block loop + "ldr s0, [x23], #0x4\n" + "tbnz %x[flags], #31, 13f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + "13:" // Height 1: Multiply loop: unique 3: skip row sum + "ldr q6, [x28, #0x0]\n" ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x11, #0x10]\n" - "ldr q8, [x11, #0x20]\n" + "ldr q7, [x28, #0x10]\n" + "sub x24, x24, #0x4\n" ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" - "ldr q9, [x11, #0x30]\n" + "ldr q8, [x28, #0x20]\n" + "cmp x24, #0x4\n" ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" - "ldr q10, [x11, #0x40]\n" - "ldr q4, [x11, #0x50]\n" + "ldr q9, [x28, #0x30]\n" + "add x28, x28, #0x40\n" ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" - "ldr q5, [x11, #0x60]\n" - "ldr q6, [x11, #0x70]\n" - ".inst 0x6fa0e150 // udot v16.4s, v10.16b, v0.4b[1]\n" - ".inst 0x6fa0e091 // udot v17.4s, v4.16b, v0.4b[1]\n" - "ldr q7, [x11, #0x80]\n" - "ldr q8, [x11, #0x90]\n" - ".inst 0x6fa0e0b2 // udot v18.4s, v5.16b, v0.4b[1]\n" - "ldr q9, [x11, #0xa0]\n" - ".inst 0x6fa0e0d3 // udot v19.4s, v6.16b, v0.4b[1]\n" - "ldr q10, [x11, #0xb0]\n" - "ldr q4, [x11, #0xc0]\n" - ".inst 0x6f80e8f0 // udot v16.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f80e911 // udot v17.4s, v8.16b, v0.4b[2]\n" - "ldr q5, [x11, #0xd0]\n" - "ldr q6, [x11, #0xe0]\n" - ".inst 0x6f80e932 // udot v18.4s, v9.16b, v0.4b[2]\n" - "ldr q7, [x11, #0xf0]\n" - "add x26, x26, #0x10\n" - ".inst 0x6f80e953 // udot v19.4s, v10.16b, v0.4b[2]\n" - ".inst 0x6fa0e890 // udot v16.4s, v4.16b, v0.4b[3]\n" - "add x11, x11, #0x100\n" - ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n" - ".inst 0x6fa0e8d2 // udot v18.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa0e8f3 // udot v19.4s, v7.16b, v0.4b[3]\n" - "tbnz %x[flags], #31, 11f\n" + "bge 12b\n" + "cbz x24, 18f\n" + "14:" // Height 1: Multiply loop: Skip odd blocks + "tbz x24, #1, 15f\n" + "ldr h0, [x23], #0x2\n" + "tbz x24, #0, 16f\n" + "ld1 { v0.b }[2], [x23]\n" + "b 16f\n" + "15:" // Height 1: Multiply loop: 
Ragged operand read: partial_1_0 + "ldr b0, [x23, #0x0]\n" + "16:" // Height 1: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 17f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" - "11:" // Height 1: Multiply loop: unique 2: skip row sum - "prfm pldl1keep, [x26, #0x80]\n" - "12:" // Height 1: Multiply loop: Main loop skip - "cbz x27, 19f\n" - "cmp x27, #0x4\n" - "blt 15f\n" - "13:" // Height 1: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "tbnz %x[flags], #31, 14f\n" - ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" - "14:" // Height 1: Multiply loop: unique 3: skip row sum - "ldr q8, [x11, #0x0]\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q9, [x11, #0x10]\n" - "ldr q10, [x11, #0x20]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr q4, [x11, #0x30]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "sub x27, x27, #0x4\n" - "add x11, x11, #0x40\n" - ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n" - "cmp x27, #0x4\n" - "bge 13b\n" - "cbz x27, 19f\n" - "15:" // Height 1: Multiply loop: Skip odd blocks - "tbz x27, #1, 16f\n" - "ldr h0, [x26], #0x2\n" - "tbz x27, #0, 17f\n" - "ld1 { v0.b }[2], [x26]\n" - "b 17f\n" - "16:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x26, #0x0]\n" - "17:" // Height 1: Multiply loop: Ragged operand read: Done - "tbnz %x[flags], #31, 18f\n" - ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" - "18:" // Height 1: Multiply loop: unique 4: skip row sum - "ldr q5, [x11, #0x0]\n" - ".inst 0x6f80e0b0 // udot v16.4s, v5.16b, v0.4b[0]\n" - "ldr q6, [x11, #0x10]\n" - "ldr q7, [x11, #0x20]\n" - ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n" - "ldr q8, [x11, #0x30]\n" - ".inst 0x6f80e0f2 // udot v18.4s, v7.16b, v0.4b[0]\n" - "add x11, x11, #0x40\n" - ".inst 0x6f80e113 // udot v19.4s, v8.16b, v0.4b[0]\n" - "19:" // Height 1: Multiply loop: No odd multiplies + "17:" // Height 1: Multiply loop: unique 4: skip row sum 
+ "ldr q10, [x28, #0x0]\n" + ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n" + "ldr q4, [x28, #0x10]\n" + "ldr q5, [x28, #0x20]\n" + ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n" + "ldr q6, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n" + "18:" // Height 1: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x28, x28, #0x1\n" - "cmp x28, x19\n" - "bne 5b\n" - "prfm pstl1keep, [x9, #0x0]\n" - "tbnz %x[flags], #31, 20f\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 4b\n" + "prfm pstl1keep, [x26, #0x0]\n" + "tbnz %x[flags], #31, 19f\n" "addp v11.4s, v11.4s, v11.4s\n" - "add x19, %x[qp], %[b_offset]\n" + "add x22, %x[qp], %[b_offset]\n" "addp v11.4s, v11.4s, v11.4s\n" - "ld1r { v1.4s }, [x19]\n" + "ld1r { v1.4s }, [x22]\n" "neg v1.4s, v1.4s\n" "mul v11.4s, v11.4s, v1.4s\n" - "20:" // Height 1: skip row sum fixup + "19:" // Height 1: skip row sum fixup "add v16.4s, v16.4s, v11.4s\n" + "ldr q0, [x27, #0x0]\n" "orr %x[flags], %x[flags], #0x80000000\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q0, [x10, #0x0]\n" + "ldr q1, [x27, #0x10]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add v18.4s, v18.4s, v11.4s\n" - "ldr q1, [x10, #0x10]\n" + "ldr q2, [x27, #0x20]\n" + "add x22, %x[qp], %[per_layer_mul]\n" "add v19.4s, v19.4s, v11.4s\n" - "ldr q2, [x10, #0x20]\n" - "ldr q3, [x10, #0x30]\n" + "ldr q3, [x27, #0x30]\n" + "add x27, x27, #0x40\n" "add v16.4s, v16.4s, v0.4s\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x20]\n" + "ld1r { v0.4s }, [x23]\n" + "ld1r { v4.4s }, [x22]\n" "add v17.4s, v17.4s, v1.4s\n" - "add x19, %x[qp], %[per_layer_mul]\n" "add v18.4s, v18.4s, v2.4s\n" - "ld1r { v4.4s }, [x19]\n" - "add x10, x10, #0x40\n" "add v19.4s, v19.4s, v3.4s\n" "sqrdmulh v16.4s, v16.4s, v4.4s\n" "sqrdmulh v17.4s, v17.4s, v4.4s\n" "sqrdmulh v18.4s, v18.4s, v4.4s\n" "sqrdmulh v19.4s, 
v19.4s, v4.4s\n" - "tbz %x[flags], #5, 21f\n" + "tbz %x[flags], #5, 20f\n" "and v4.16b, v16.16b, v0.16b\n" "sshr v4.4s, v4.4s, #0x1f\n" "and v5.16b, v17.16b, v0.16b\n" @@ -303,18 +294,18 @@ void a64_hybrid_u8qa_dot_4x16 ( "sqadd v17.4s, v17.4s, v5.4s\n" "sqadd v18.4s, v18.4s, v6.4s\n" "sqadd v19.4s, v19.4s, v7.4s\n" - "21:" // Height 1: no shift correction + "20:" // Height 1: no shift correction "srshl v16.4s, v16.4s, v0.4s\n" - "add x19, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x19]\n" + "add x22, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x22]\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x19, %x[qp], %[minval]\n" + "add x22, %x[qp], %[minval]\n" "srshl v18.4s, v18.4s, v0.4s\n" - "ld1r { v5.4s }, [x19]\n" - "add x19, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x22]\n" + "add x22, %x[qp], %[maxval]\n" "srshl v19.4s, v19.4s, v0.4s\n" - "ld1r { v6.4s }, [x19]\n" - "cmp x12, #0x10\n" + "ld1r { v6.4s }, [x22]\n" + "cmp x9, #0x10\n" "add v16.4s, v16.4s, v4.4s\n" "add v17.4s, v17.4s, v4.4s\n" "add v18.4s, v18.4s, v4.4s\n" @@ -330,80 +321,69 @@ void a64_hybrid_u8qa_dot_4x16 ( "smax v19.4s, v19.4s, v5.4s\n" "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v16.16b, v16.16b, v17.16b\n" - "bge 30f\n" - "tbz x12, #3, 25f\n" - "str d16, [x9], #0x8\n" - "tbz x12, #2, 23f\n" - "st1 { v16.s }[2], [x9], #0x4\n" - "tbz x12, #1, 22f\n" - "st1 { v16.h }[6], [x9], #0x2\n" - "tbz x12, #0, 29f\n" - "st1 { v16.b }[14], [x9]\n" - "b 29f\n" - "22:" // Height 1: Partial direct writeback: partial_1_12 - "tbz x12, #0, 29f\n" - "st1 { v16.b }[12], [x9]\n" - "b 29f\n" - "23:" // Height 1: Partial direct writeback: partial_2_8 - "tbz x12, #1, 24f\n" - "st1 { v16.h }[4], [x9], #0x2\n" - "tbz x12, #0, 29f\n" - "st1 { v16.b }[10], [x9]\n" - "b 29f\n" - "24:" // Height 1: Partial direct writeback: partial_1_8 - "tbz x12, #0, 29f\n" - "st1 { v16.b }[8], [x9]\n" - "b 29f\n" - "25:" // Height 1: Partial direct writeback: partial_4_0 - "tbz x12, #2, 27f\n" - "str s16, [x9], #0x4\n" - "tbz x12, #1, 26f\n" - "st1 { 
v16.h }[2], [x9], #0x2\n" - "tbz x12, #0, 29f\n" - "st1 { v16.b }[6], [x9]\n" - "b 29f\n" - "26:" // Height 1: Partial direct writeback: partial_1_4 - "tbz x12, #0, 29f\n" - "st1 { v16.b }[4], [x9]\n" - "b 29f\n" - "27:" // Height 1: Partial direct writeback: partial_2_0 - "tbz x12, #1, 28f\n" - "str h16, [x9], #0x2\n" - "tbz x12, #0, 29f\n" - "st1 { v16.b }[2], [x9]\n" - "b 29f\n" - "28:" // Height 1: Partial direct writeback: partial_1_0 - "str b16, [x9, #0x0]\n" - "29:" // Height 1: Partial direct writeback: Done - "b 31f\n" - "30:" // Height 1: Full writeback - "str q16, [x9, #0x0]\n" - "add x9, x9, #0x10\n" - "31:" // Height 1: Writeback done - "subs x12, x12, #0x10\n" - "bgt 3b\n" - "b 126f\n" - "32:" // Height 2 + "bge 29f\n" + "tbz x9, #3, 24f\n" + "str d16, [x26], #0x8\n" + "tbz x9, #2, 22f\n" + "st1 { v16.s }[2], [x26], #0x4\n" + "tbz x9, #1, 21f\n" + "st1 { v16.h }[6], [x26], #0x2\n" + "tbz x9, #0, 28f\n" + "st1 { v16.b }[14], [x26]\n" + "b 28f\n" + "21:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x9, #0, 28f\n" + "st1 { v16.b }[12], [x26]\n" + "b 28f\n" + "22:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x9, #1, 23f\n" + "st1 { v16.h }[4], [x26], #0x2\n" + "tbz x9, #0, 28f\n" + "st1 { v16.b }[10], [x26]\n" + "b 28f\n" + "23:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x9, #0, 28f\n" + "st1 { v16.b }[8], [x26]\n" + "b 28f\n" + "24:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x9, #2, 26f\n" + "str s16, [x26], #0x4\n" + "tbz x9, #1, 25f\n" + "st1 { v16.h }[2], [x26], #0x2\n" + "tbz x9, #0, 28f\n" + "st1 { v16.b }[6], [x26]\n" + "b 28f\n" + "25:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x9, #0, 28f\n" + "st1 { v16.b }[4], [x26]\n" + "b 28f\n" + "26:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x9, #1, 27f\n" + "str h16, [x26], #0x2\n" + "tbz x9, #0, 28f\n" + "st1 { v16.b }[2], [x26]\n" + "b 28f\n" + "27:" // Height 1: Partial direct writeback: partial_1_0 + "str b16, 
[x26, #0x0]\n" + "28:" // Height 1: Partial direct writeback: Done + "b 30f\n" + "29:" // Height 1: Full writeback + "str q16, [x26, #0x0]\n" + "add x26, x26, #0x10\n" + "30:" // Height 1: Writeback done + "subs x9, x9, #0x10\n" + "bgt 2b\n" + "b 122f\n" + "31:" // Height 2 "movi v11.4s, #0x0\n" - "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" - "mov x10, %x[col_bias]\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[col_bias]\n" "movi v12.4s, #0x0\n" - "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" "bic %x[flags], %x[flags], #0x80000000\n" - "movi v13.4s, #0x0\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "movi v14.4s, #0x0\n" "movi v15.16b, #0x1\n" - "tbz %x[flags], #2, 33f\n" - "ldr x9, [%x[output_ptr], #0x0]\n" - "ldr x25, [%x[output_ptr], #0x8]\n" - "add x9, x9, x19\n" - "add x25, x25, x19\n" - "b 34f\n" - "33:" // Height 2: setup direct output - "mov x9, %x[output_ptr]\n" - "add x25, x9, x19\n" - "34:" // Height 2: Column loop + "mov x26, %x[output_ptr]\n" + "32:" // Height 2: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -412,71 +392,135 @@ void a64_hybrid_u8qa_dot_4x16 ( "movi v21.4s, #0x0\n" "movi v22.4s, #0x0\n" "movi v23.4s, #0x0\n" - "35:" // Height 2: setup done - "mov x28, #0x0\n" - "36:" // Height 2: String loop + "33:" // Height 2: setup done + "mov x25, #0x0\n" + "34:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 37f\n" - "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 35f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x26, [x20, #0x0]\n" - "ldr x24, [x20, #0x8]\n" - "cbnz x28, 38f\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "cbnz x25, 36f\n" "ldr x19, [%x[args_ptr], 
%[offsetof_input_initial_col]]\n" - "add x26, x26, x19\n" - "add x24, x24, x19\n" - "b 38f\n" - "37:" // Height 2: setup direct input - "mov x26, %x[input_ptr]\n" - "add x24, x26, x19\n" - "38:" // Height 2: input setup done - "cmp x27, #0x10\n" - "blt 43f\n" - "cmp x27, #0x20\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "b 36f\n" + "35:" // Height 2: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19\n" + "36:" // Height 2: input setup done + "cmp x24, #0x10\n" "blt 41f\n" - "39:" // Height 2: Multiply loop: Main loop head - "ldr q0, [x26, #0x0]\n" - "ldr q1, [x24, #0x0]\n" - "ldr q4, [x11, #0x0]\n" + "ldr q0, [x23, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + "cmp x24, #0x20\n" + "ldr q4, [x28, #0x0]\n" + "blt 39f\n" + "37:" // Height 2: Multiply loop: Main loop head ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" - "ldr q5, [x11, #0x10]\n" + "ldr q5, [x28, #0x10]\n" + "add x23, x23, #0x10\n" ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" - "ldr q6, [x11, #0x20]\n" - "ldr q7, [x11, #0x30]\n" + "ldr q6, [x28, #0x20]\n" + "add x22, x22, #0x10\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" - "ldr q8, [x11, #0x40]\n" + "ldr q7, [x28, #0x30]\n" ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" - "ldr q9, [x11, #0x50]\n" + "ldr q8, [x28, #0x40]\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" - "ldr q10, [x11, #0x60]\n" + "ldr q9, [x28, #0x50]\n" ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" - "ldr q4, [x11, #0x70]\n" + "ldr q10, [x28, #0x60]\n" + "ldr q4, [x28, #0x70]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" - "ldr q5, [x11, #0x80]\n" ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" - "ldr q6, [x11, #0x90]\n" + "ldr q5, [x28, #0x80]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" - "ldr q7, [x11, #0xa0]\n" + "ldr q6, [x28, #0x90]\n" ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" - "ldr q8, [x11, #0xb0]\n" + "ldr q7, [x28, #0xa0]\n" ".inst 0x6fa0e131 // udot v17.4s, 
v9.16b, v0.4b[1]\n" - "add x26, x26, #0x10\n" + "ldr q8, [x28, #0xb0]\n" + ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" + "ldr q9, [x28, #0xc0]\n" + ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" + ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" + "ldr q10, [x28, #0xd0]\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" + "ldr q4, [x28, #0xe0]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" + "ldr q5, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" + "tbnz %x[flags], #31, 38f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + "38:" // Height 2: Multiply loop: unique 5: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "sub x24, x24, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "cmp x24, #0x20\n" + "ldr q0, [x23, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + "ldr q4, [x28, #0x0]\n" + "bge 37b\n" + "39:" // Height 2: Multiply loop: Single iteration only + ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" + "ldr q5, [x28, #0x10]\n" + "sub x24, x24, #0x10\n" + ".inst 0x6f81e094 // udot 
v20.4s, v4.16b, v1.4b[0]\n" + "ldr q6, [x28, #0x20]\n" + "add x23, x23, #0x10\n" + ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" + "ldr q7, [x28, #0x30]\n" + "add x22, x22, #0x10\n" + ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" + "ldr q8, [x28, #0x40]\n" + ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" + "ldr q9, [x28, #0x50]\n" + ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" + "ldr q10, [x28, #0x60]\n" + "ldr q4, [x28, #0x70]\n" + ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" + "ldr q5, [x28, #0x80]\n" + ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" + "ldr q6, [x28, #0x90]\n" + ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" + "ldr q7, [x28, #0xa0]\n" + ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" + "ldr q8, [x28, #0xb0]\n" ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" - "ldr q9, [x11, #0xc0]\n" + "ldr q9, [x28, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" - "add x24, x24, #0x10\n" ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" - "ldr q10, [x11, #0xd0]\n" + "ldr q10, [x28, #0xd0]\n" ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" - "ldr q4, [x11, #0xe0]\n" + "ldr q4, [x28, #0xe0]\n" ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" - "ldr q5, [x11, #0xf0]\n" + "ldr q5, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" - "add x11, x11, #0x100\n" ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" @@ -493,171 +537,109 @@ void a64_hybrid_u8qa_dot_4x16 ( "tbnz %x[flags], #31, 40f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" - "40:" // Height 2: Multiply loop: unique 5: skip row sum 
- "prfm pldl1keep, [x26, #0x80]\n" - "sub x27, x27, #0x10\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x27, #0x20\n" - "bge 39b\n" - "41:" // Height 2: Multiply loop: Single iteration only - "sub x27, x27, #0x10\n" - "ldr q0, [x26, #0x0]\n" - "ldr q1, [x24, #0x0]\n" - "ldr q6, [x11, #0x0]\n" + "40:" // Height 2: Multiply loop: unique 6: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "41:" // Height 2: Multiply loop: Main loop skip + "cbz x24, 48f\n" + "cmp x24, #0x4\n" + "blt 44f\n" + "42:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x23], #0x4\n" + "ldr s1, [x22], #0x4\n" + "tbnz %x[flags], #31, 43f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + "43:" // Height 2: Multiply loop: unique 7: skip row sum + "ldr q6, [x28, #0x0]\n" ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x11, #0x10]\n" + "ldr q7, [x28, #0x10]\n" + "sub x24, x24, #0x4\n" ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n" - "ldr q8, [x11, #0x20]\n" - "ldr q9, [x11, #0x30]\n" + "ldr q8, [x28, #0x20]\n" + "cmp x24, #0x4\n" ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" - "ldr q10, [x11, #0x40]\n" + "ldr q9, [x28, #0x30]\n" + "add x28, x28, #0x40\n" ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n" - "ldr q4, [x11, #0x50]\n" ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" - "ldr q5, [x11, #0x60]\n" ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n" - "ldr q6, [x11, #0x70]\n" ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" - "ldr q7, [x11, #0x80]\n" ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n" - "ldr q8, [x11, #0x90]\n" - ".inst 0x6fa0e150 // udot v16.4s, v10.16b, v0.4b[1]\n" - "ldr q9, [x11, #0xa0]\n" - ".inst 0x6fa1e154 // udot v20.4s, v10.16b, v1.4b[1]\n" - "ldr q10, [x11, #0xb0]\n" - ".inst 0x6fa0e091 // udot v17.4s, v4.16b, v0.4b[1]\n" - "add x26, x26, #0x10\n" - ".inst 0x6fa1e095 // udot v21.4s, v4.16b, v1.4b[1]\n" - "ldr 
q4, [x11, #0xc0]\n" - ".inst 0x6fa0e0b2 // udot v18.4s, v5.16b, v0.4b[1]\n" - "add x24, x24, #0x10\n" - ".inst 0x6fa1e0b6 // udot v22.4s, v5.16b, v1.4b[1]\n" - "ldr q5, [x11, #0xd0]\n" - ".inst 0x6fa0e0d3 // udot v19.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0d7 // udot v23.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x11, #0xe0]\n" - ".inst 0x6f80e8f0 // udot v16.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8f4 // udot v20.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x11, #0xf0]\n" - ".inst 0x6f80e911 // udot v17.4s, v8.16b, v0.4b[2]\n" - "add x11, x11, #0x100\n" - ".inst 0x6f81e915 // udot v21.4s, v8.16b, v1.4b[2]\n" - ".inst 0x6f80e932 // udot v18.4s, v9.16b, v0.4b[2]\n" - ".inst 0x6f81e936 // udot v22.4s, v9.16b, v1.4b[2]\n" - ".inst 0x6f80e953 // udot v19.4s, v10.16b, v0.4b[2]\n" - ".inst 0x6f81e957 // udot v23.4s, v10.16b, v1.4b[2]\n" - ".inst 0x6fa0e890 // udot v16.4s, v4.16b, v0.4b[3]\n" - ".inst 0x6fa1e894 // udot v20.4s, v4.16b, v1.4b[3]\n" - ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n" - ".inst 0x6fa1e8b5 // udot v21.4s, v5.16b, v1.4b[3]\n" - ".inst 0x6fa0e8d2 // udot v18.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa1e8d6 // udot v22.4s, v6.16b, v1.4b[3]\n" - ".inst 0x6fa0e8f3 // udot v19.4s, v7.16b, v0.4b[3]\n" - ".inst 0x6fa1e8f7 // udot v23.4s, v7.16b, v1.4b[3]\n" - "tbnz %x[flags], #31, 42f\n" - ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" - ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" - "42:" // Height 2: Multiply loop: unique 6: skip row sum - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "43:" // Height 2: Multiply loop: Main loop skip - "cbz x27, 50f\n" - "cmp x27, #0x4\n" - "blt 46f\n" - "44:" // Height 2: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x24], #0x4\n" - "tbnz %x[flags], #31, 45f\n" + "bge 42b\n" + "cbz x24, 48f\n" + "44:" // Height 2: Multiply loop: Skip odd blocks + "tbz x24, #1, 45f\n" + "ldr h0, [x23], #0x2\n" + "ldr h1, [x22], #0x2\n" + "tbz x24, #0, 46f\n" + "ld1 { v0.b }[2], [x23]\n" + 
"ld1 { v1.b }[2], [x22]\n" + "b 46f\n" + "45:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x23, #0x0]\n" + "ldr b1, [x22, #0x0]\n" + "46:" // Height 2: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 47f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" - "45:" // Height 2: Multiply loop: unique 7: skip row sum - "ldr q8, [x11, #0x0]\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q9, [x11, #0x10]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q10, [x11, #0x20]\n" - "ldr q4, [x11, #0x30]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "sub x27, x27, #0x4\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "cmp x27, #0x4\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "add x11, x11, #0x40\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n" - ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n" - "bge 44b\n" - "cbz x27, 50f\n" - "46:" // Height 2: Multiply loop: Skip odd blocks - "tbz x27, #1, 47f\n" - "ldr h0, [x26], #0x2\n" - "ldr h1, [x24], #0x2\n" - "tbz x27, #0, 48f\n" - "ld1 { v0.b }[2], [x26]\n" - "ld1 { v1.b }[2], [x24]\n" - "b 48f\n" - "47:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x26, #0x0]\n" - "ldr b1, [x24, #0x0]\n" - "48:" // Height 2: Multiply loop: Ragged operand read: Done - "tbnz %x[flags], #31, 49f\n" - ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" - ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" - "49:" // Height 2: Multiply loop: unique 8: skip row sum - "ldr q5, [x11, #0x0]\n" - ".inst 0x6f80e0b0 // udot v16.4s, v5.16b, v0.4b[0]\n" - "ldr q6, [x11, #0x10]\n" - ".inst 0x6f81e0b4 // udot v20.4s, v5.16b, v1.4b[0]\n" - "ldr q7, [x11, #0x20]\n" - "ldr q8, [x11, #0x30]\n" - ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n" - "add x11, x11, #0x40\n" - ".inst 
0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f80e0f2 // udot v18.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0f6 // udot v22.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f80e113 // udot v19.4s, v8.16b, v0.4b[0]\n" - ".inst 0x6f81e117 // udot v23.4s, v8.16b, v1.4b[0]\n" - "50:" // Height 2: Multiply loop: No odd multiplies + "47:" // Height 2: Multiply loop: unique 8: skip row sum + "ldr q10, [x28, #0x0]\n" + ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n" + "ldr q4, [x28, #0x10]\n" + ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n" + "ldr q5, [x28, #0x20]\n" + "ldr q6, [x28, #0x30]\n" + ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n" + "add x28, x28, #0x40\n" + ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n" + ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n" + "48:" // Height 2: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x28, x28, #0x1\n" - "cmp x28, x19\n" - "bne 36b\n" - "prfm pstl1keep, [x9, #0x0]\n" - "prfm pstl1keep, [x25, #0x0]\n" - "tbnz %x[flags], #31, 51f\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 34b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x21, x26, x19\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbnz %x[flags], #31, 49f\n" "addp v11.4s, v11.4s, v11.4s\n" - "add x19, %x[qp], %[b_offset]\n" - "ld1r { v2.4s }, [x19]\n" + "add x22, %x[qp], %[b_offset]\n" + "ld1r { v2.4s }, [x22]\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "neg v2.4s, v2.4s\n" "mul v11.4s, v11.4s, v2.4s\n" "mul v12.4s, v12.4s, v2.4s\n" - "51:" // Height 2: skip row sum fixup + "49:" // Height 2: skip row sum fixup "add v16.4s, v16.4s, v11.4s\n" + "ldr q0, [x27, #0x0]\n" "orr %x[flags], %x[flags], #0x80000000\n" "add v17.4s, 
v17.4s, v11.4s\n" - "ldr q0, [x10, #0x0]\n" + "ldr q1, [x27, #0x10]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add v18.4s, v18.4s, v11.4s\n" - "ldr q1, [x10, #0x10]\n" + "ldr q2, [x27, #0x20]\n" + "add x22, %x[qp], %[per_layer_mul]\n" "add v19.4s, v19.4s, v11.4s\n" - "ldr q2, [x10, #0x20]\n" + "ldr q3, [x27, #0x30]\n" + "add x27, x27, #0x40\n" "add v20.4s, v20.4s, v12.4s\n" - "ldr q3, [x10, #0x30]\n" + "ld1r { v4.4s }, [x22]\n" "add v21.4s, v21.4s, v12.4s\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" "add v22.4s, v22.4s, v12.4s\n" - "add x19, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x19]\n" "add v23.4s, v23.4s, v12.4s\n" - "add x10, x10, #0x40\n" "add v16.4s, v16.4s, v0.4s\n" "add v17.4s, v17.4s, v1.4s\n" "add v18.4s, v18.4s, v2.4s\n" "add v19.4s, v19.4s, v3.4s\n" "add v20.4s, v20.4s, v0.4s\n" - "ld1r { v0.4s }, [x20]\n" + "ld1r { v0.4s }, [x23]\n" "add v21.4s, v21.4s, v1.4s\n" "add v22.4s, v22.4s, v2.4s\n" "add v23.4s, v23.4s, v3.4s\n" @@ -669,7 +651,7 @@ void a64_hybrid_u8qa_dot_4x16 ( "sqrdmulh v21.4s, v21.4s, v4.4s\n" "sqrdmulh v22.4s, v22.4s, v4.4s\n" "sqrdmulh v23.4s, v23.4s, v4.4s\n" - "tbz %x[flags], #5, 52f\n" + "tbz %x[flags], #5, 50f\n" "and v4.16b, v16.16b, v0.16b\n" "sshr v4.4s, v4.4s, #0x1f\n" "and v5.16b, v17.16b, v0.16b\n" @@ -694,18 +676,18 @@ void a64_hybrid_u8qa_dot_4x16 ( "sqadd v21.4s, v21.4s, v9.4s\n" "sqadd v22.4s, v22.4s, v10.4s\n" "sqadd v23.4s, v23.4s, v4.4s\n" - "52:" // Height 2: no shift correction + "50:" // Height 2: no shift correction "srshl v16.4s, v16.4s, v0.4s\n" - "add x19, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x19]\n" + "add x22, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x22]\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x19, %x[qp], %[minval]\n" + "add x22, %x[qp], %[minval]\n" "srshl v18.4s, v18.4s, v0.4s\n" - "ld1r { v5.4s }, [x19]\n" - "add x19, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x22]\n" + "add x22, %x[qp], %[maxval]\n" "srshl v19.4s, v19.4s, v0.4s\n" - "ld1r { v6.4s }, [x19]\n" - "cmp x12, 
#0x10\n" + "ld1r { v6.4s }, [x22]\n" + "cmp x9, #0x10\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" "srshl v22.4s, v22.4s, v0.4s\n" @@ -740,100 +722,86 @@ void a64_hybrid_u8qa_dot_4x16 ( "uzp1 v16.16b, v16.16b, v17.16b\n" "uzp1 v21.8h, v22.8h, v23.8h\n" "uzp1 v20.16b, v20.16b, v21.16b\n" - "bge 61f\n" - "tbz x12, #3, 56f\n" - "str d16, [x9], #0x8\n" - "str d20, [x25], #0x8\n" - "tbz x12, #2, 54f\n" - "st1 { v16.s }[2], [x9], #0x4\n" - "st1 { v20.s }[2], [x25], #0x4\n" - "tbz x12, #1, 53f\n" - "st1 { v16.h }[6], [x9], #0x2\n" - "st1 { v20.h }[6], [x25], #0x2\n" - "tbz x12, #0, 60f\n" - "st1 { v16.b }[14], [x9]\n" - "st1 { v20.b }[14], [x25]\n" - "b 60f\n" - "53:" // Height 2: Partial direct writeback: partial_1_12 - "tbz x12, #0, 60f\n" - "st1 { v16.b }[12], [x9]\n" - "st1 { v20.b }[12], [x25]\n" - "b 60f\n" - "54:" // Height 2: Partial direct writeback: partial_2_8 - "tbz x12, #1, 55f\n" - "st1 { v16.h }[4], [x9], #0x2\n" - "st1 { v20.h }[4], [x25], #0x2\n" - "tbz x12, #0, 60f\n" - "st1 { v16.b }[10], [x9]\n" - "st1 { v20.b }[10], [x25]\n" - "b 60f\n" - "55:" // Height 2: Partial direct writeback: partial_1_8 - "tbz x12, #0, 60f\n" - "st1 { v16.b }[8], [x9]\n" - "st1 { v20.b }[8], [x25]\n" - "b 60f\n" - "56:" // Height 2: Partial direct writeback: partial_4_0 - "tbz x12, #2, 58f\n" - "str s16, [x9], #0x4\n" - "str s20, [x25], #0x4\n" - "tbz x12, #1, 57f\n" - "st1 { v16.h }[2], [x9], #0x2\n" - "st1 { v20.h }[2], [x25], #0x2\n" - "tbz x12, #0, 60f\n" - "st1 { v16.b }[6], [x9]\n" - "st1 { v20.b }[6], [x25]\n" - "b 60f\n" - "57:" // Height 2: Partial direct writeback: partial_1_4 - "tbz x12, #0, 60f\n" - "st1 { v16.b }[4], [x9]\n" - "st1 { v20.b }[4], [x25]\n" + "bge 59f\n" + "tbz x9, #3, 54f\n" + "str d16, [x26], #0x8\n" + "str d20, [x21], #0x8\n" + "tbz x9, #2, 52f\n" + "st1 { v16.s }[2], [x26], #0x4\n" + "st1 { v20.s }[2], [x21], #0x4\n" + "tbz x9, #1, 51f\n" + "st1 { v16.h }[6], [x26], #0x2\n" + "st1 { v20.h }[6], [x21], #0x2\n" + "tbz x9, #0, 
58f\n" + "st1 { v16.b }[14], [x26]\n" + "st1 { v20.b }[14], [x21]\n" + "b 58f\n" + "51:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x9, #0, 58f\n" + "st1 { v16.b }[12], [x26]\n" + "st1 { v20.b }[12], [x21]\n" + "b 58f\n" + "52:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x9, #1, 53f\n" + "st1 { v16.h }[4], [x26], #0x2\n" + "st1 { v20.h }[4], [x21], #0x2\n" + "tbz x9, #0, 58f\n" + "st1 { v16.b }[10], [x26]\n" + "st1 { v20.b }[10], [x21]\n" + "b 58f\n" + "53:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x9, #0, 58f\n" + "st1 { v16.b }[8], [x26]\n" + "st1 { v20.b }[8], [x21]\n" + "b 58f\n" + "54:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x9, #2, 56f\n" + "str s16, [x26], #0x4\n" + "str s20, [x21], #0x4\n" + "tbz x9, #1, 55f\n" + "st1 { v16.h }[2], [x26], #0x2\n" + "st1 { v20.h }[2], [x21], #0x2\n" + "tbz x9, #0, 58f\n" + "st1 { v16.b }[6], [x26]\n" + "st1 { v20.b }[6], [x21]\n" + "b 58f\n" + "55:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x9, #0, 58f\n" + "st1 { v16.b }[4], [x26]\n" + "st1 { v20.b }[4], [x21]\n" + "b 58f\n" + "56:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x9, #1, 57f\n" + "str h16, [x26], #0x2\n" + "str h20, [x21], #0x2\n" + "tbz x9, #0, 58f\n" + "st1 { v16.b }[2], [x26]\n" + "st1 { v20.b }[2], [x21]\n" + "b 58f\n" + "57:" // Height 2: Partial direct writeback: partial_1_0 + "str b16, [x26, #0x0]\n" + "str b20, [x21, #0x0]\n" + "58:" // Height 2: Partial direct writeback: Done "b 60f\n" - "58:" // Height 2: Partial direct writeback: partial_2_0 - "tbz x12, #1, 59f\n" - "str h16, [x9], #0x2\n" - "str h20, [x25], #0x2\n" - "tbz x12, #0, 60f\n" - "st1 { v16.b }[2], [x9]\n" - "st1 { v20.b }[2], [x25]\n" - "b 60f\n" - "59:" // Height 2: Partial direct writeback: partial_1_0 - "str b16, [x9, #0x0]\n" - "str b20, [x25, #0x0]\n" - "60:" // Height 2: Partial direct writeback: Done - "b 62f\n" - "61:" // Height 2: Full writeback - "str q16, [x9, #0x0]\n" - "str q20, 
[x25, #0x0]\n" - "add x9, x9, #0x10\n" - "add x25, x25, #0x10\n" - "62:" // Height 2: Writeback done - "subs x12, x12, #0x10\n" - "bgt 34b\n" - "b 126f\n" - "63:" // Height 3 + "59:" // Height 2: Full writeback + "str q16, [x26, #0x0]\n" + "add x26, x26, #0x10\n" + "str q20, [x21, #0x0]\n" + "60:" // Height 2: Writeback done + "subs x9, x9, #0x10\n" + "bgt 32b\n" + "b 122f\n" + "61:" // Height 3 "movi v11.4s, #0x0\n" - "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" - "mov x10, %x[col_bias]\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[col_bias]\n" "movi v12.4s, #0x0\n" - "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" "bic %x[flags], %x[flags], #0x80000000\n" "movi v13.4s, #0x0\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "movi v14.4s, #0x0\n" + "mov x26, %x[output_ptr]\n" "movi v15.16b, #0x1\n" - "tbz %x[flags], #2, 64f\n" - "ldr x9, [%x[output_ptr], #0x0]\n" - "ldr x25, [%x[output_ptr], #0x8]\n" - "add x9, x9, x19\n" - "ldr x23, [%x[output_ptr], #0x10]\n" - "add x25, x25, x19\n" - "add x23, x23, x19\n" - "b 65f\n" - "64:" // Height 3: setup direct output - "mov x9, %x[output_ptr]\n" - "add x25, x9, x19\n" - "add x23, x25, x19\n" - "65:" // Height 3: Column loop + "62:" // Height 3: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -846,85 +814,169 @@ void a64_hybrid_u8qa_dot_4x16 ( "movi v25.4s, #0x0\n" "movi v26.4s, #0x0\n" "movi v27.4s, #0x0\n" - "66:" // Height 3: setup done - "mov x28, #0x0\n" - "67:" // Height 3: String loop + "63:" // Height 3: setup done + "mov x25, #0x0\n" + "64:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 68f\n" - "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 65f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" "add x20, x20, 
x19, LSL #3\n" - "ldr x26, [x20, #0x0]\n" - "ldr x24, [x20, #0x8]\n" - "ldr x22, [x20, #0x10]\n" - "cbnz x28, 69f\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "ldr x21, [x20, #0x10]\n" + "cbnz x25, 66f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x19\n" - "add x24, x24, x19\n" + "add x23, x23, x19\n" "add x22, x22, x19\n" - "b 69f\n" - "68:" // Height 3: setup direct input - "mov x26, %x[input_ptr]\n" - "add x24, x26, x19\n" - "add x22, x24, x19\n" - "69:" // Height 3: input setup done - "cmp x27, #0x10\n" - "blt 74f\n" - "cmp x27, #0x20\n" - "blt 72f\n" - "70:" // Height 3: Multiply loop: Main loop head - "ldr q0, [x26, #0x0]\n" - "ldr q1, [x24, #0x0]\n" - "ldr q2, [x22, #0x0]\n" - "ldr q4, [x11, #0x0]\n" + "add x21, x21, x19\n" + "b 66f\n" + "65:" // Height 3: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "66:" // Height 3: input setup done + "cmp x24, #0x10\n" + "blt 71f\n" + "ldr q0, [x23, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + "cmp x24, #0x20\n" + "ldr q2, [x21, #0x0]\n" + "ldr q4, [x28, #0x0]\n" + "blt 69f\n" + "67:" // Height 3: Multiply loop: Main loop head ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" - "ldr q5, [x11, #0x10]\n" + "ldr q5, [x28, #0x10]\n" + "add x23, x23, #0x10\n" ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" - "ldr q6, [x11, #0x20]\n" + "ldr q6, [x28, #0x20]\n" + "add x22, x22, #0x10\n" ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n" - "ldr q7, [x11, #0x30]\n" - "ldr q8, [x11, #0x40]\n" + "ldr q7, [x28, #0x30]\n" + "add x21, x21, #0x10\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" - "ldr q9, [x11, #0x50]\n" + "ldr q8, [x28, #0x40]\n" ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" - "ldr q10, [x11, #0x60]\n" + "ldr q9, [x28, #0x50]\n" ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n" - "ldr q4, [x11, #0x70]\n" + "ldr q10, [x28, #0x60]\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" - "ldr q5, 
[x11, #0x80]\n" + "ldr q4, [x28, #0x70]\n" ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" - "add x26, x26, #0x10\n" + "ldr q5, [x28, #0x80]\n" ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x11, #0x90]\n" + "ldr q6, [x28, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" - "add x24, x24, #0x10\n" ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x28, #0xa0]\n" + ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" + ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" + ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n" + "ldr q8, [x28, #0xb0]\n" + ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" + ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" + ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n" + "ldr q9, [x28, #0xc0]\n" + ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" + ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" + ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n" + "ldr q10, [x28, #0xd0]\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n" + "ldr q4, [x28, #0xe0]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n" + "ldr q5, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 
0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n" + "tbnz %x[flags], #31, 68f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + "68:" // Height 3: Multiply loop: unique 9: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "sub x24, x24, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "cmp x24, #0x20\n" + "prfm pldl1keep, [x21, #0x80]\n" + "ldr q0, [x23, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + "ldr q2, [x21, #0x0]\n" + "ldr q4, [x28, #0x0]\n" + "bge 67b\n" + "69:" // Height 3: Multiply loop: Single iteration only + ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" + "ldr q5, [x28, #0x10]\n" + "sub x24, x24, #0x10\n" + ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" + "ldr q6, [x28, #0x20]\n" + "add x23, x23, #0x10\n" + ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n" + "ldr q7, [x28, #0x30]\n" "add x22, x22, #0x10\n" + ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" + "ldr q8, [x28, #0x40]\n" + "add x21, x21, #0x10\n" + ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" + "ldr q9, [x28, #0x50]\n" + ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n" + "ldr q10, [x28, #0x60]\n" + ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" + "ldr q4, [x28, #0x70]\n" + ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" + "ldr q5, 
[x28, #0x80]\n" + ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x28, #0x90]\n" + ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x11, #0xa0]\n" + "ldr q7, [x28, #0xa0]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n" - "ldr q8, [x11, #0xb0]\n" + "ldr q8, [x28, #0xb0]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n" - "ldr q9, [x11, #0xc0]\n" + "ldr q9, [x28, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n" - "ldr q10, [x11, #0xd0]\n" + "ldr q10, [x28, #0xd0]\n" ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n" - "ldr q4, [x11, #0xe0]\n" + "ldr q4, [x28, #0xe0]\n" ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n" - "ldr q5, [x11, #0xf0]\n" + "ldr q5, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" - "add x11, x11, #0x100\n" ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n" ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" @@ -945,181 +997,100 @@ void a64_hybrid_u8qa_dot_4x16 ( ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n" - "tbnz %x[flags], #31, 71f\n" + "tbnz %x[flags], #31, 70f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, 
v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" - "71:" // Height 3: Multiply loop: unique 9: skip row sum - "prfm pldl1keep, [x26, #0x80]\n" - "sub x27, x27, #0x10\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x27, #0x20\n" + "70:" // Height 3: Multiply loop: unique 10: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" - "bge 70b\n" - "72:" // Height 3: Multiply loop: Single iteration only - "sub x27, x27, #0x10\n" - "ldr q0, [x26, #0x0]\n" - "ldr q1, [x24, #0x0]\n" - "ldr q2, [x22, #0x0]\n" - "ldr q6, [x11, #0x0]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "71:" // Height 3: Multiply loop: Main loop skip + "cbz x24, 78f\n" + "cmp x24, #0x4\n" + "blt 74f\n" + "72:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x23], #0x4\n" + "ldr s1, [x22], #0x4\n" + "ldr s2, [x21], #0x4\n" + "tbnz %x[flags], #31, 73f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + "73:" // Height 3: Multiply loop: unique 11: skip row sum + "ldr q6, [x28, #0x0]\n" ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x11, #0x10]\n" + "ldr q7, [x28, #0x10]\n" + "sub x24, x24, #0x4\n" ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n" - "ldr q8, [x11, #0x20]\n" + "ldr q8, [x28, #0x20]\n" + "cmp x24, #0x4\n" ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n" - "ldr q9, [x11, #0x30]\n" - "ldr q10, [x11, #0x40]\n" + "ldr q9, [x28, #0x30]\n" + "add x28, x28, #0x40\n" ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" - "ldr q4, [x11, #0x50]\n" ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n" - "ldr q5, [x11, #0x60]\n" ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n" - "ldr q6, [x11, #0x70]\n" ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" - "ldr q7, [x11, #0x80]\n" ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n" - "add x26, x26, #0x10\n" 
".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n" - "ldr q8, [x11, #0x90]\n" ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" - "add x24, x24, #0x10\n" ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n" - "add x22, x22, #0x10\n" ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n" - "ldr q9, [x11, #0xa0]\n" - ".inst 0x6fa0e150 // udot v16.4s, v10.16b, v0.4b[1]\n" - ".inst 0x6fa1e154 // udot v20.4s, v10.16b, v1.4b[1]\n" - ".inst 0x6fa2e158 // udot v24.4s, v10.16b, v2.4b[1]\n" - "ldr q10, [x11, #0xb0]\n" - ".inst 0x6fa0e091 // udot v17.4s, v4.16b, v0.4b[1]\n" - ".inst 0x6fa1e095 // udot v21.4s, v4.16b, v1.4b[1]\n" - ".inst 0x6fa2e099 // udot v25.4s, v4.16b, v2.4b[1]\n" - "ldr q4, [x11, #0xc0]\n" - ".inst 0x6fa0e0b2 // udot v18.4s, v5.16b, v0.4b[1]\n" - ".inst 0x6fa1e0b6 // udot v22.4s, v5.16b, v1.4b[1]\n" - ".inst 0x6fa2e0ba // udot v26.4s, v5.16b, v2.4b[1]\n" - "ldr q5, [x11, #0xd0]\n" - ".inst 0x6fa0e0d3 // udot v19.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0d7 // udot v23.4s, v6.16b, v1.4b[1]\n" - ".inst 0x6fa2e0db // udot v27.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x11, #0xe0]\n" - ".inst 0x6f80e8f0 // udot v16.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8f4 // udot v20.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6f82e8f8 // udot v24.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x11, #0xf0]\n" - ".inst 0x6f80e911 // udot v17.4s, v8.16b, v0.4b[2]\n" - "add x11, x11, #0x100\n" - ".inst 0x6f81e915 // udot v21.4s, v8.16b, v1.4b[2]\n" - ".inst 0x6f82e919 // udot v25.4s, v8.16b, v2.4b[2]\n" - ".inst 0x6f80e932 // udot v18.4s, v9.16b, v0.4b[2]\n" - ".inst 0x6f81e936 // udot v22.4s, v9.16b, v1.4b[2]\n" - ".inst 0x6f82e93a // udot v26.4s, v9.16b, v2.4b[2]\n" - ".inst 0x6f80e953 // udot v19.4s, v10.16b, v0.4b[2]\n" - ".inst 0x6f81e957 // udot v23.4s, v10.16b, v1.4b[2]\n" - ".inst 0x6f82e95b // udot v27.4s, v10.16b, v2.4b[2]\n" - ".inst 0x6fa0e890 // udot v16.4s, v4.16b, v0.4b[3]\n" - ".inst 0x6fa1e894 // udot v20.4s, v4.16b, v1.4b[3]\n" - ".inst 0x6fa2e898 // udot v24.4s, v4.16b, 
v2.4b[3]\n" - ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n" - ".inst 0x6fa1e8b5 // udot v21.4s, v5.16b, v1.4b[3]\n" - ".inst 0x6fa2e8b9 // udot v25.4s, v5.16b, v2.4b[3]\n" - ".inst 0x6fa0e8d2 // udot v18.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa1e8d6 // udot v22.4s, v6.16b, v1.4b[3]\n" - ".inst 0x6fa2e8da // udot v26.4s, v6.16b, v2.4b[3]\n" - ".inst 0x6fa0e8f3 // udot v19.4s, v7.16b, v0.4b[3]\n" - ".inst 0x6fa1e8f7 // udot v23.4s, v7.16b, v1.4b[3]\n" - ".inst 0x6fa2e8fb // udot v27.4s, v7.16b, v2.4b[3]\n" - "tbnz %x[flags], #31, 73f\n" - ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" - ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" - ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" - "73:" // Height 3: Multiply loop: unique 10: skip row sum - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "74:" // Height 3: Multiply loop: Main loop skip - "cbz x27, 81f\n" - "cmp x27, #0x4\n" - "blt 77f\n" - "75:" // Height 3: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x24], #0x4\n" - "ldr s2, [x22], #0x4\n" - "tbnz %x[flags], #31, 76f\n" - ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" - ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" - ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" - "76:" // Height 3: Multiply loop: unique 11: skip row sum - "ldr q8, [x11, #0x0]\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q9, [x11, #0x10]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q10, [x11, #0x20]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - "ldr q4, [x11, #0x30]\n" - "sub x27, x27, #0x4\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "cmp x27, #0x4\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "add x11, x11, #0x40\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - 
".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n" - ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n" - ".inst 0x6f82e09b // udot v27.4s, v4.16b, v2.4b[0]\n" - "bge 75b\n" - "cbz x27, 81f\n" - "77:" // Height 3: Multiply loop: Skip odd blocks - "tbz x27, #1, 78f\n" - "ldr h0, [x26], #0x2\n" - "ldr h1, [x24], #0x2\n" - "ldr h2, [x22], #0x2\n" - "tbz x27, #0, 79f\n" - "ld1 { v0.b }[2], [x26]\n" - "ld1 { v1.b }[2], [x24]\n" - "ld1 { v2.b }[2], [x22]\n" - "b 79f\n" - "78:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x26, #0x0]\n" - "ldr b1, [x24, #0x0]\n" - "ldr b2, [x22, #0x0]\n" - "79:" // Height 3: Multiply loop: Ragged operand read: Done - "tbnz %x[flags], #31, 80f\n" + "bge 72b\n" + "cbz x24, 78f\n" + "74:" // Height 3: Multiply loop: Skip odd blocks + "tbz x24, #1, 75f\n" + "ldr h0, [x23], #0x2\n" + "ldr h1, [x22], #0x2\n" + "ldr h2, [x21], #0x2\n" + "tbz x24, #0, 76f\n" + "ld1 { v0.b }[2], [x23]\n" + "ld1 { v1.b }[2], [x22]\n" + "ld1 { v2.b }[2], [x21]\n" + "b 76f\n" + "75:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x23, #0x0]\n" + "ldr b1, [x22, #0x0]\n" + "ldr b2, [x21, #0x0]\n" + "76:" // Height 3: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 77f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" - "80:" // Height 3: Multiply loop: unique 12: skip row sum - "ldr q5, [x11, #0x0]\n" - ".inst 0x6f80e0b0 // udot v16.4s, v5.16b, v0.4b[0]\n" - "ldr q6, [x11, #0x10]\n" - ".inst 0x6f81e0b4 // udot v20.4s, v5.16b, v1.4b[0]\n" - "ldr q7, [x11, #0x20]\n" - ".inst 0x6f82e0b8 // udot v24.4s, v5.16b, v2.4b[0]\n" - "ldr q8, [x11, #0x30]\n" - "add x11, x11, #0x40\n" - ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d9 // udot v25.4s, v6.16b, 
v2.4b[0]\n" - ".inst 0x6f80e0f2 // udot v18.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0f6 // udot v22.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0fa // udot v26.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f80e113 // udot v19.4s, v8.16b, v0.4b[0]\n" - ".inst 0x6f81e117 // udot v23.4s, v8.16b, v1.4b[0]\n" - ".inst 0x6f82e11b // udot v27.4s, v8.16b, v2.4b[0]\n" - "81:" // Height 3: Multiply loop: No odd multiplies + "77:" // Height 3: Multiply loop: unique 12: skip row sum + "ldr q10, [x28, #0x0]\n" + ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n" + "ldr q4, [x28, #0x10]\n" + ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n" + "ldr q5, [x28, #0x20]\n" + ".inst 0x6f82e158 // udot v24.4s, v10.16b, v2.4b[0]\n" + "ldr q6, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6f82e099 // udot v25.4s, v4.16b, v2.4b[0]\n" + ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n" + ".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n" + ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0db // udot v27.4s, v6.16b, v2.4b[0]\n" + "78:" // Height 3: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x28, x28, #0x1\n" - "cmp x28, x19\n" - "bne 67b\n" - "prfm pstl1keep, [x9, #0x0]\n" - "prfm pstl1keep, [x25, #0x0]\n" - "prfm pstl1keep, [x23, #0x0]\n" - "tbnz %x[flags], #31, 82f\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 64b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x21, x26, x19\n" + "prfm pstl1keep, [x21, #0x0]\n" + "add x20, x21, x19\n" + "prfm pstl1keep, [x20, #0x0]\n" + "tbnz %x[flags], #31, 79f\n" "addp v11.4s, v11.4s, v11.4s\n" - "add x19, %x[qp], %[b_offset]\n" - "ld1r { v3.4s }, [x19]\n" + "add x22, %x[qp], %[b_offset]\n" + 
"ld1r { v3.4s }, [x22]\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" "addp v11.4s, v11.4s, v11.4s\n" @@ -1129,24 +1100,24 @@ void a64_hybrid_u8qa_dot_4x16 ( "mul v11.4s, v11.4s, v3.4s\n" "mul v12.4s, v12.4s, v3.4s\n" "mul v13.4s, v13.4s, v3.4s\n" - "82:" // Height 3: skip row sum fixup + "79:" // Height 3: skip row sum fixup "add v16.4s, v16.4s, v11.4s\n" + "ldr q0, [x27, #0x0]\n" "orr %x[flags], %x[flags], #0x80000000\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q0, [x10, #0x0]\n" + "ldr q1, [x27, #0x10]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add v18.4s, v18.4s, v11.4s\n" - "ldr q1, [x10, #0x10]\n" + "ldr q2, [x27, #0x20]\n" + "add x22, %x[qp], %[per_layer_mul]\n" "add v19.4s, v19.4s, v11.4s\n" - "ldr q2, [x10, #0x20]\n" + "ldr q3, [x27, #0x30]\n" + "add x27, x27, #0x40\n" "add v20.4s, v20.4s, v12.4s\n" - "ldr q3, [x10, #0x30]\n" + "ld1r { v4.4s }, [x22]\n" "add v21.4s, v21.4s, v12.4s\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" "add v22.4s, v22.4s, v12.4s\n" - "add x19, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x19]\n" "add v23.4s, v23.4s, v12.4s\n" - "add x10, x10, #0x40\n" "add v24.4s, v24.4s, v13.4s\n" "add v25.4s, v25.4s, v13.4s\n" "add v26.4s, v26.4s, v13.4s\n" @@ -1160,7 +1131,7 @@ void a64_hybrid_u8qa_dot_4x16 ( "add v22.4s, v22.4s, v2.4s\n" "add v23.4s, v23.4s, v3.4s\n" "add v24.4s, v24.4s, v0.4s\n" - "ld1r { v0.4s }, [x20]\n" + "ld1r { v0.4s }, [x23]\n" "add v25.4s, v25.4s, v1.4s\n" "add v26.4s, v26.4s, v2.4s\n" "add v27.4s, v27.4s, v3.4s\n" @@ -1176,7 +1147,7 @@ void a64_hybrid_u8qa_dot_4x16 ( "sqrdmulh v25.4s, v25.4s, v4.4s\n" "sqrdmulh v26.4s, v26.4s, v4.4s\n" "sqrdmulh v27.4s, v27.4s, v4.4s\n" - "tbz %x[flags], #5, 83f\n" + "tbz %x[flags], #5, 80f\n" "and v4.16b, v16.16b, v0.16b\n" "sshr v4.4s, v4.4s, #0x1f\n" "and v5.16b, v17.16b, v0.16b\n" @@ -1213,18 +1184,18 @@ void a64_hybrid_u8qa_dot_4x16 ( "sshr v8.4s, v8.4s, #0x1f\n" "sqadd v26.4s, v26.4s, v7.4s\n" "sqadd v27.4s, v27.4s, v8.4s\n" - "83:" // Height 3: 
no shift correction + "80:" // Height 3: no shift correction "srshl v16.4s, v16.4s, v0.4s\n" - "add x19, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x19]\n" + "add x22, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x22]\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x19, %x[qp], %[minval]\n" + "add x22, %x[qp], %[minval]\n" "srshl v18.4s, v18.4s, v0.4s\n" - "ld1r { v5.4s }, [x19]\n" - "add x19, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x22]\n" + "add x22, %x[qp], %[maxval]\n" "srshl v19.4s, v19.4s, v0.4s\n" - "ld1r { v6.4s }, [x19]\n" - "cmp x12, #0x10\n" + "ld1r { v6.4s }, [x22]\n" + "cmp x9, #0x10\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" "srshl v22.4s, v22.4s, v0.4s\n" @@ -1278,122 +1249,106 @@ void a64_hybrid_u8qa_dot_4x16 ( "uzp1 v16.16b, v16.16b, v17.16b\n" "uzp1 v20.16b, v20.16b, v21.16b\n" "uzp1 v24.16b, v24.16b, v25.16b\n" - "bge 92f\n" - "tbz x12, #3, 87f\n" - "str d16, [x9], #0x8\n" - "str d20, [x25], #0x8\n" - "str d24, [x23], #0x8\n" - "tbz x12, #2, 85f\n" - "st1 { v16.s }[2], [x9], #0x4\n" - "st1 { v20.s }[2], [x25], #0x4\n" - "st1 { v24.s }[2], [x23], #0x4\n" - "tbz x12, #1, 84f\n" - "st1 { v16.h }[6], [x9], #0x2\n" - "st1 { v20.h }[6], [x25], #0x2\n" - "st1 { v24.h }[6], [x23], #0x2\n" - "tbz x12, #0, 91f\n" - "st1 { v16.b }[14], [x9]\n" - "st1 { v20.b }[14], [x25]\n" - "st1 { v24.b }[14], [x23]\n" - "b 91f\n" - "84:" // Height 3: Partial direct writeback: partial_1_12 - "tbz x12, #0, 91f\n" - "st1 { v16.b }[12], [x9]\n" - "st1 { v20.b }[12], [x25]\n" - "st1 { v24.b }[12], [x23]\n" - "b 91f\n" - "85:" // Height 3: Partial direct writeback: partial_2_8 - "tbz x12, #1, 86f\n" - "st1 { v16.h }[4], [x9], #0x2\n" - "st1 { v20.h }[4], [x25], #0x2\n" - "st1 { v24.h }[4], [x23], #0x2\n" - "tbz x12, #0, 91f\n" - "st1 { v16.b }[10], [x9]\n" - "st1 { v20.b }[10], [x25]\n" - "st1 { v24.b }[10], [x23]\n" - "b 91f\n" - "86:" // Height 3: Partial direct writeback: partial_1_8 - "tbz x12, #0, 91f\n" - "st1 { v16.b }[8], [x9]\n" - "st1 { v20.b }[8], 
[x25]\n" - "st1 { v24.b }[8], [x23]\n" - "b 91f\n" - "87:" // Height 3: Partial direct writeback: partial_4_0 - "tbz x12, #2, 89f\n" - "str s16, [x9], #0x4\n" - "str s20, [x25], #0x4\n" - "str s24, [x23], #0x4\n" - "tbz x12, #1, 88f\n" - "st1 { v16.h }[2], [x9], #0x2\n" - "st1 { v20.h }[2], [x25], #0x2\n" - "st1 { v24.h }[2], [x23], #0x2\n" - "tbz x12, #0, 91f\n" - "st1 { v16.b }[6], [x9]\n" - "st1 { v20.b }[6], [x25]\n" - "st1 { v24.b }[6], [x23]\n" - "b 91f\n" - "88:" // Height 3: Partial direct writeback: partial_1_4 - "tbz x12, #0, 91f\n" - "st1 { v16.b }[4], [x9]\n" - "st1 { v20.b }[4], [x25]\n" - "st1 { v24.b }[4], [x23]\n" - "b 91f\n" - "89:" // Height 3: Partial direct writeback: partial_2_0 - "tbz x12, #1, 90f\n" - "str h16, [x9], #0x2\n" - "str h20, [x25], #0x2\n" - "str h24, [x23], #0x2\n" - "tbz x12, #0, 91f\n" - "st1 { v16.b }[2], [x9]\n" - "st1 { v20.b }[2], [x25]\n" - "st1 { v24.b }[2], [x23]\n" - "b 91f\n" - "90:" // Height 3: Partial direct writeback: partial_1_0 - "str b16, [x9, #0x0]\n" - "str b20, [x25, #0x0]\n" - "str b24, [x23, #0x0]\n" - "91:" // Height 3: Partial direct writeback: Done - "b 93f\n" - "92:" // Height 3: Full writeback - "str q16, [x9, #0x0]\n" - "str q20, [x25, #0x0]\n" - "str q24, [x23, #0x0]\n" - "add x9, x9, #0x10\n" - "add x25, x25, #0x10\n" - "add x23, x23, #0x10\n" - "93:" // Height 3: Writeback done - "subs x12, x12, #0x10\n" - "bgt 65b\n" - "b 126f\n" - "94:" // Height 4 + "bge 89f\n" + "tbz x9, #3, 84f\n" + "str d16, [x26], #0x8\n" + "str d20, [x21], #0x8\n" + "str d24, [x20], #0x8\n" + "tbz x9, #2, 82f\n" + "st1 { v16.s }[2], [x26], #0x4\n" + "st1 { v20.s }[2], [x21], #0x4\n" + "st1 { v24.s }[2], [x20], #0x4\n" + "tbz x9, #1, 81f\n" + "st1 { v16.h }[6], [x26], #0x2\n" + "st1 { v20.h }[6], [x21], #0x2\n" + "st1 { v24.h }[6], [x20], #0x2\n" + "tbz x9, #0, 88f\n" + "st1 { v16.b }[14], [x26]\n" + "st1 { v20.b }[14], [x21]\n" + "st1 { v24.b }[14], [x20]\n" + "b 88f\n" + "81:" // Height 3: Partial direct writeback: 
partial_1_12 + "tbz x9, #0, 88f\n" + "st1 { v16.b }[12], [x26]\n" + "st1 { v20.b }[12], [x21]\n" + "st1 { v24.b }[12], [x20]\n" + "b 88f\n" + "82:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x9, #1, 83f\n" + "st1 { v16.h }[4], [x26], #0x2\n" + "st1 { v20.h }[4], [x21], #0x2\n" + "st1 { v24.h }[4], [x20], #0x2\n" + "tbz x9, #0, 88f\n" + "st1 { v16.b }[10], [x26]\n" + "st1 { v20.b }[10], [x21]\n" + "st1 { v24.b }[10], [x20]\n" + "b 88f\n" + "83:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x9, #0, 88f\n" + "st1 { v16.b }[8], [x26]\n" + "st1 { v20.b }[8], [x21]\n" + "st1 { v24.b }[8], [x20]\n" + "b 88f\n" + "84:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x9, #2, 86f\n" + "str s16, [x26], #0x4\n" + "str s20, [x21], #0x4\n" + "str s24, [x20], #0x4\n" + "tbz x9, #1, 85f\n" + "st1 { v16.h }[2], [x26], #0x2\n" + "st1 { v20.h }[2], [x21], #0x2\n" + "st1 { v24.h }[2], [x20], #0x2\n" + "tbz x9, #0, 88f\n" + "st1 { v16.b }[6], [x26]\n" + "st1 { v20.b }[6], [x21]\n" + "st1 { v24.b }[6], [x20]\n" + "b 88f\n" + "85:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x9, #0, 88f\n" + "st1 { v16.b }[4], [x26]\n" + "st1 { v20.b }[4], [x21]\n" + "st1 { v24.b }[4], [x20]\n" + "b 88f\n" + "86:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x9, #1, 87f\n" + "str h16, [x26], #0x2\n" + "str h20, [x21], #0x2\n" + "str h24, [x20], #0x2\n" + "tbz x9, #0, 88f\n" + "st1 { v16.b }[2], [x26]\n" + "st1 { v20.b }[2], [x21]\n" + "st1 { v24.b }[2], [x20]\n" + "b 88f\n" + "87:" // Height 3: Partial direct writeback: partial_1_0 + "str b16, [x26, #0x0]\n" + "str b20, [x21, #0x0]\n" + "str b24, [x20, #0x0]\n" + "88:" // Height 3: Partial direct writeback: Done + "b 90f\n" + "89:" // Height 3: Full writeback + "str q16, [x26, #0x0]\n" + "add x26, x26, #0x10\n" + "str q20, [x21, #0x0]\n" + "str q24, [x20, #0x0]\n" + "90:" // Height 3: Writeback done + "subs x9, x9, #0x10\n" + "bgt 62b\n" + "b 122f\n" + "91:" // Height 4 "movi v11.4s, 
#0x0\n" - "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" - "mov x10, %x[col_bias]\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[col_bias]\n" "movi v12.4s, #0x0\n" - "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" "bic %x[flags], %x[flags], #0x80000000\n" "movi v13.4s, #0x0\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x26, %x[output_ptr]\n" "movi v14.4s, #0x0\n" + "mov x19, #0x4\n" "movi v15.16b, #0x1\n" - "tbz %x[flags], #2, 95f\n" - "ldr x9, [%x[output_ptr], #0x0]\n" - "ldr x25, [%x[output_ptr], #0x8]\n" - "add x9, x9, x19\n" - "ldr x23, [%x[output_ptr], #0x10]\n" - "ldr x21, [%x[output_ptr], #0x18]\n" - "add x25, x25, x19\n" - "add %x[output_ptr], %x[output_ptr], #0x20\n" - "add x23, x23, x19\n" - "add x21, x21, x19\n" - "b 96f\n" - "95:" // Height 4: setup direct output - "mov x9, %x[output_ptr]\n" - "add x25, x9, x19\n" - "add x23, x25, x19\n" - "add x21, x23, x19\n" - "add %x[output_ptr], x21, x19\n" - "96:" // Height 4: Column loop + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "92:" // Height 4: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" @@ -1410,99 +1365,99 @@ void a64_hybrid_u8qa_dot_4x16 ( "movi v29.4s, #0x0\n" "movi v30.4s, #0x0\n" "movi v31.4s, #0x0\n" - "97:" // Height 4: setup done - "mov x28, #0x0\n" - "98:" // Height 4: String loop + "93:" // Height 4: setup done + "mov x25, #0x0\n" + "94:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 99f\n" - "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 95f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x26, [x20, #0x0]\n" - "ldr x24, [x20, #0x8]\n" - "ldr x22, [x20, #0x10]\n" + "ldr x23, [x20, 
#0x0]\n" + "ldr x22, [x20, #0x8]\n" + "ldr x21, [x20, #0x10]\n" "ldr x20, [x20, #0x18]\n" - "cbnz x28, 100f\n" + "cbnz x25, 96f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x19\n" - "add x24, x24, x19\n" + "add x23, x23, x19\n" "add x22, x22, x19\n" + "add x21, x21, x19\n" "add x20, x20, x19\n" - "b 100f\n" - "99:" // Height 4: setup direct input - "mov x26, %x[input_ptr]\n" - "add x24, x26, x19\n" - "add x22, x24, x19\n" - "add x20, x22, x19\n" - "100:" // Height 4: input setup done - "cmp x27, #0x10\n" - "blt 105f\n" - "cmp x27, #0x20\n" - "blt 103f\n" - "101:" // Height 4: Multiply loop: Main loop head - "ldr q0, [x26, #0x0]\n" - "ldr q1, [x24, #0x0]\n" - "ldr q2, [x22, #0x0]\n" + "b 96f\n" + "95:" // Height 4: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "add x20, x21, x19\n" + "96:" // Height 4: input setup done + "cmp x24, #0x10\n" + "blt 101f\n" + "ldr q0, [x23, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + "cmp x24, #0x20\n" + "ldr q2, [x21, #0x0]\n" "ldr q3, [x20, #0x0]\n" - "ldr q4, [x11, #0x0]\n" + "ldr q4, [x28, #0x0]\n" + "blt 99f\n" + "97:" // Height 4: Multiply loop: Main loop head ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" - "ldr q5, [x11, #0x10]\n" + "ldr q5, [x28, #0x10]\n" + "add x23, x23, #0x10\n" ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" - "ldr q6, [x11, #0x20]\n" + "ldr q6, [x28, #0x20]\n" + "add x22, x22, #0x10\n" ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n" - "ldr q7, [x11, #0x30]\n" + "ldr q7, [x28, #0x30]\n" + "add x21, x21, #0x10\n" ".inst 0x6f83e09c // udot v28.4s, v4.16b, v3.4b[0]\n" - "ldr q8, [x11, #0x40]\n" - "ldr q9, [x11, #0x50]\n" + "ldr q8, [x28, #0x40]\n" + "add x20, x20, #0x10\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" - "ldr q10, [x11, #0x60]\n" + "ldr q9, [x28, #0x50]\n" ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" - "ldr q4, [x11, #0x70]\n" + "ldr q10, [x28, #0x60]\n" ".inst 0x6f82e0b9 // udot 
v25.4s, v5.16b, v2.4b[0]\n" - "add x26, x26, #0x10\n" + "ldr q4, [x28, #0x70]\n" ".inst 0x6f83e0bd // udot v29.4s, v5.16b, v3.4b[0]\n" - "ldr q5, [x11, #0x80]\n" + "ldr q5, [x28, #0x80]\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" - "add x24, x24, #0x10\n" ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" - "add x22, x22, #0x10\n" ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n" - "add x20, x20, #0x10\n" ".inst 0x6f83e0de // udot v30.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x11, #0x90]\n" + "ldr q6, [x28, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n" ".inst 0x6f83e0ff // udot v31.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x11, #0xa0]\n" + "ldr q7, [x28, #0xa0]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n" ".inst 0x6fa3e11c // udot v28.4s, v8.16b, v3.4b[1]\n" - "ldr q8, [x11, #0xb0]\n" + "ldr q8, [x28, #0xb0]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n" ".inst 0x6fa3e13d // udot v29.4s, v9.16b, v3.4b[1]\n" - "ldr q9, [x11, #0xc0]\n" + "ldr q9, [x28, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n" ".inst 0x6fa3e15e // udot v30.4s, v10.16b, v3.4b[1]\n" - "ldr q10, [x11, #0xd0]\n" + "ldr q10, [x28, #0xd0]\n" ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n" ".inst 0x6fa3e09f // udot v31.4s, v4.16b, v3.4b[1]\n" - "ldr q4, [x11, #0xe0]\n" + "ldr q4, [x28, #0xe0]\n" ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, 
v1.4b[2]\n" ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n" ".inst 0x6f83e8bc // udot v28.4s, v5.16b, v3.4b[2]\n" - "ldr q5, [x11, #0xf0]\n" + "ldr q5, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" - "add x11, x11, #0x100\n" ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n" ".inst 0x6f83e8dd // udot v29.4s, v6.16b, v3.4b[2]\n" @@ -1530,218 +1485,222 @@ void a64_hybrid_u8qa_dot_4x16 ( ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n" ".inst 0x6fa3e8bf // udot v31.4s, v5.16b, v3.4b[3]\n" - "tbnz %x[flags], #31, 102f\n" + "tbnz %x[flags], #31, 98f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" - "102:" // Height 4: Multiply loop: unique 13: skip row sum - "prfm pldl1keep, [x26, #0x80]\n" - "sub x27, x27, #0x10\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x27, #0x20\n" + "98:" // Height 4: Multiply loop: unique 13: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "sub x24, x24, #0x10\n" "prfm pldl1keep, [x22, #0x80]\n" + "cmp x24, #0x20\n" + "prfm pldl1keep, [x21, #0x80]\n" "prfm pldl1keep, [x20, #0x80]\n" - "bge 101b\n" - "103:" // Height 4: Multiply loop: Single iteration only - "sub x27, x27, #0x10\n" - "ldr q0, [x26, #0x0]\n" - "ldr q1, [x24, #0x0]\n" - "ldr q2, [x22, #0x0]\n" + "ldr q0, [x23, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + "ldr q2, [x21, #0x0]\n" "ldr q3, [x20, #0x0]\n" - "ldr q6, [x11, #0x0]\n" + "ldr q4, [x28, #0x0]\n" + "bge 97b\n" + "99:" // Height 4: Multiply loop: Single iteration only + ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" + "ldr q5, [x28, #0x10]\n" + "sub x24, x24, #0x10\n" + ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" + "ldr q6, [x28, #0x20]\n" + "add x23, x23, #0x10\n" + ".inst 
0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n" + "ldr q7, [x28, #0x30]\n" + "add x22, x22, #0x10\n" + ".inst 0x6f83e09c // udot v28.4s, v4.16b, v3.4b[0]\n" + "ldr q8, [x28, #0x40]\n" + "add x21, x21, #0x10\n" + ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" + "ldr q9, [x28, #0x50]\n" + "add x20, x20, #0x10\n" + ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" + "ldr q10, [x28, #0x60]\n" + ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n" + "ldr q4, [x28, #0x70]\n" + ".inst 0x6f83e0bd // udot v29.4s, v5.16b, v3.4b[0]\n" + "ldr q5, [x28, #0x80]\n" + ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0de // udot v30.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x28, #0x90]\n" + ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0ff // udot v31.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x28, #0xa0]\n" + ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" + ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" + ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n" + ".inst 0x6fa3e11c // udot v28.4s, v8.16b, v3.4b[1]\n" + "ldr q8, [x28, #0xb0]\n" + ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" + ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" + ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n" + ".inst 0x6fa3e13d // udot v29.4s, v9.16b, v3.4b[1]\n" + "ldr q9, [x28, #0xc0]\n" + ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" + ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" + ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n" + ".inst 0x6fa3e15e // udot v30.4s, v10.16b, v3.4b[1]\n" + "ldr q10, [x28, #0xd0]\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n" + 
".inst 0x6fa3e09f // udot v31.4s, v4.16b, v3.4b[1]\n" + "ldr q4, [x28, #0xe0]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n" + ".inst 0x6f83e8bc // udot v28.4s, v5.16b, v3.4b[2]\n" + "ldr q5, [x28, #0xf0]\n" + "add x28, x28, #0x100\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8dd // udot v29.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8fe // udot v30.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 0x6f83e91f // udot v31.4s, v8.16b, v3.4b[2]\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n" + ".inst 0x6fa3e93c // udot v28.4s, v9.16b, v3.4b[3]\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n" + ".inst 0x6fa3e95d // udot v29.4s, v10.16b, v3.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x6fa3e89e // udot v30.4s, v4.16b, v3.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n" + ".inst 0x6fa3e8bf // udot v31.4s, v5.16b, v3.4b[3]\n" + "tbnz %x[flags], #31, 100f\n" + ".inst 0x6e8f940b // udot v11.4s, 
v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" + "100:" // Height 4: Multiply loop: unique 14: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "101:" // Height 4: Multiply loop: Main loop skip + "cbz x24, 108f\n" + "cmp x24, #0x4\n" + "blt 104f\n" + "102:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x23], #0x4\n" + "ldr s1, [x22], #0x4\n" + "ldr s2, [x21], #0x4\n" + "ldr s3, [x20], #0x4\n" + "tbnz %x[flags], #31, 103f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" + "103:" // Height 4: Multiply loop: unique 15: skip row sum + "ldr q6, [x28, #0x0]\n" ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x11, #0x10]\n" + "ldr q7, [x28, #0x10]\n" + "sub x24, x24, #0x4\n" ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n" - "ldr q8, [x11, #0x20]\n" + "ldr q8, [x28, #0x20]\n" + "cmp x24, #0x4\n" ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n" - "ldr q9, [x11, #0x30]\n" + "ldr q9, [x28, #0x30]\n" + "add x28, x28, #0x40\n" ".inst 0x6f83e0dc // udot v28.4s, v6.16b, v3.4b[0]\n" - "ldr q10, [x11, #0x40]\n" - "ldr q4, [x11, #0x50]\n" ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" - "ldr q5, [x11, #0x60]\n" ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n" - "ldr q6, [x11, #0x70]\n" ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n" - "add x26, x26, #0x10\n" ".inst 0x6f83e0fd // udot v29.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x11, #0x80]\n" ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" - "add x24, x24, #0x10\n" ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n" - "add x22, x22, #0x10\n" ".inst 0x6f82e11a // udot 
v26.4s, v8.16b, v2.4b[0]\n" - "add x20, x20, #0x10\n" ".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n" - "ldr q8, [x11, #0x90]\n" ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n" ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n" ".inst 0x6f83e13f // udot v31.4s, v9.16b, v3.4b[0]\n" - "ldr q9, [x11, #0xa0]\n" - ".inst 0x6fa0e150 // udot v16.4s, v10.16b, v0.4b[1]\n" - ".inst 0x6fa1e154 // udot v20.4s, v10.16b, v1.4b[1]\n" - ".inst 0x6fa2e158 // udot v24.4s, v10.16b, v2.4b[1]\n" - ".inst 0x6fa3e15c // udot v28.4s, v10.16b, v3.4b[1]\n" - "ldr q10, [x11, #0xb0]\n" - ".inst 0x6fa0e091 // udot v17.4s, v4.16b, v0.4b[1]\n" - ".inst 0x6fa1e095 // udot v21.4s, v4.16b, v1.4b[1]\n" - ".inst 0x6fa2e099 // udot v25.4s, v4.16b, v2.4b[1]\n" - ".inst 0x6fa3e09d // udot v29.4s, v4.16b, v3.4b[1]\n" - "ldr q4, [x11, #0xc0]\n" - ".inst 0x6fa0e0b2 // udot v18.4s, v5.16b, v0.4b[1]\n" - ".inst 0x6fa1e0b6 // udot v22.4s, v5.16b, v1.4b[1]\n" - ".inst 0x6fa2e0ba // udot v26.4s, v5.16b, v2.4b[1]\n" - ".inst 0x6fa3e0be // udot v30.4s, v5.16b, v3.4b[1]\n" - "ldr q5, [x11, #0xd0]\n" - ".inst 0x6fa0e0d3 // udot v19.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0d7 // udot v23.4s, v6.16b, v1.4b[1]\n" - ".inst 0x6fa2e0db // udot v27.4s, v6.16b, v2.4b[1]\n" - ".inst 0x6fa3e0df // udot v31.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x11, #0xe0]\n" - ".inst 0x6f80e8f0 // udot v16.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8f4 // udot v20.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6f82e8f8 // udot v24.4s, v7.16b, v2.4b[2]\n" - ".inst 0x6f83e8fc // udot v28.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x11, #0xf0]\n" - ".inst 0x6f80e911 // udot v17.4s, v8.16b, v0.4b[2]\n" - "add x11, x11, #0x100\n" - ".inst 0x6f81e915 // udot v21.4s, v8.16b, v1.4b[2]\n" - ".inst 0x6f82e919 // udot v25.4s, v8.16b, v2.4b[2]\n" - ".inst 0x6f83e91d // udot v29.4s, v8.16b, v3.4b[2]\n" - ".inst 0x6f80e932 // udot v18.4s, v9.16b, v0.4b[2]\n" - ".inst 0x6f81e936 // udot v22.4s, v9.16b, 
v1.4b[2]\n" - ".inst 0x6f82e93a // udot v26.4s, v9.16b, v2.4b[2]\n" - ".inst 0x6f83e93e // udot v30.4s, v9.16b, v3.4b[2]\n" - ".inst 0x6f80e953 // udot v19.4s, v10.16b, v0.4b[2]\n" - ".inst 0x6f81e957 // udot v23.4s, v10.16b, v1.4b[2]\n" - ".inst 0x6f82e95b // udot v27.4s, v10.16b, v2.4b[2]\n" - ".inst 0x6f83e95f // udot v31.4s, v10.16b, v3.4b[2]\n" - ".inst 0x6fa0e890 // udot v16.4s, v4.16b, v0.4b[3]\n" - ".inst 0x6fa1e894 // udot v20.4s, v4.16b, v1.4b[3]\n" - ".inst 0x6fa2e898 // udot v24.4s, v4.16b, v2.4b[3]\n" - ".inst 0x6fa3e89c // udot v28.4s, v4.16b, v3.4b[3]\n" - ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n" - ".inst 0x6fa1e8b5 // udot v21.4s, v5.16b, v1.4b[3]\n" - ".inst 0x6fa2e8b9 // udot v25.4s, v5.16b, v2.4b[3]\n" - ".inst 0x6fa3e8bd // udot v29.4s, v5.16b, v3.4b[3]\n" - ".inst 0x6fa0e8d2 // udot v18.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa1e8d6 // udot v22.4s, v6.16b, v1.4b[3]\n" - ".inst 0x6fa2e8da // udot v26.4s, v6.16b, v2.4b[3]\n" - ".inst 0x6fa3e8de // udot v30.4s, v6.16b, v3.4b[3]\n" - ".inst 0x6fa0e8f3 // udot v19.4s, v7.16b, v0.4b[3]\n" - ".inst 0x6fa1e8f7 // udot v23.4s, v7.16b, v1.4b[3]\n" - ".inst 0x6fa2e8fb // udot v27.4s, v7.16b, v2.4b[3]\n" - ".inst 0x6fa3e8ff // udot v31.4s, v7.16b, v3.4b[3]\n" - "tbnz %x[flags], #31, 104f\n" - ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" - ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" - ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" - ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" - "104:" // Height 4: Multiply loop: unique 14: skip row sum - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x20, #0x80]\n" - "105:" // Height 4: Multiply loop: Main loop skip - "cbz x27, 112f\n" - "cmp x27, #0x4\n" - "blt 108f\n" - "106:" // Height 4: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x24], #0x4\n" - "ldr s2, [x22], #0x4\n" - "ldr s3, [x20], #0x4\n" - "tbnz %x[flags], #31, 107f\n" - 
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" - ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" - ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" - ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" - "107:" // Height 4: Multiply loop: unique 15: skip row sum - "ldr q8, [x11, #0x0]\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q9, [x11, #0x10]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q10, [x11, #0x20]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - "ldr q4, [x11, #0x30]\n" - ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" - "sub x27, x27, #0x4\n" - "add x11, x11, #0x40\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "cmp x27, #0x4\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" - ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n" - ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n" - ".inst 0x6f82e09b // udot v27.4s, v4.16b, v2.4b[0]\n" - ".inst 0x6f83e09f // udot v31.4s, v4.16b, v3.4b[0]\n" - "bge 106b\n" - "cbz x27, 112f\n" - "108:" // Height 4: Multiply loop: Skip odd blocks - "tbz x27, #1, 109f\n" - "ldr h0, [x26], #0x2\n" - "ldr h1, [x24], #0x2\n" - "ldr h2, [x22], #0x2\n" + "bge 102b\n" + "cbz x24, 108f\n" + "104:" // Height 4: Multiply loop: Skip odd blocks + "tbz x24, #1, 105f\n" + "ldr h0, [x23], #0x2\n" + "ldr h1, [x22], #0x2\n" + "ldr h2, [x21], #0x2\n" "ldr h3, [x20], #0x2\n" - "tbz x27, #0, 110f\n" - "ld1 { v0.b }[2], [x26]\n" - "ld1 { v1.b }[2], [x24]\n" - "ld1 { v2.b }[2], [x22]\n" + "tbz x24, #0, 106f\n" + "ld1 { v0.b }[2], [x23]\n" + "ld1 { v1.b }[2], [x22]\n" + "ld1 { v2.b }[2], [x21]\n" "ld1 { v3.b 
}[2], [x20]\n" - "b 110f\n" - "109:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x26, #0x0]\n" - "ldr b1, [x24, #0x0]\n" - "ldr b2, [x22, #0x0]\n" + "b 106f\n" + "105:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x23, #0x0]\n" + "ldr b1, [x22, #0x0]\n" + "ldr b2, [x21, #0x0]\n" "ldr b3, [x20, #0x0]\n" - "110:" // Height 4: Multiply loop: Ragged operand read: Done - "tbnz %x[flags], #31, 111f\n" + "106:" // Height 4: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 107f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" - "111:" // Height 4: Multiply loop: unique 16: skip row sum - "ldr q5, [x11, #0x0]\n" - ".inst 0x6f80e0b0 // udot v16.4s, v5.16b, v0.4b[0]\n" - "ldr q6, [x11, #0x10]\n" - ".inst 0x6f81e0b4 // udot v20.4s, v5.16b, v1.4b[0]\n" - "ldr q7, [x11, #0x20]\n" - ".inst 0x6f82e0b8 // udot v24.4s, v5.16b, v2.4b[0]\n" - "ldr q8, [x11, #0x30]\n" - ".inst 0x6f83e0bc // udot v28.4s, v5.16b, v3.4b[0]\n" - "add x11, x11, #0x40\n" - ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0dd // udot v29.4s, v6.16b, v3.4b[0]\n" - ".inst 0x6f80e0f2 // udot v18.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0f6 // udot v22.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0fa // udot v26.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0fe // udot v30.4s, v7.16b, v3.4b[0]\n" - ".inst 0x6f80e113 // udot v19.4s, v8.16b, v0.4b[0]\n" - ".inst 0x6f81e117 // udot v23.4s, v8.16b, v1.4b[0]\n" - ".inst 0x6f82e11b // udot v27.4s, v8.16b, v2.4b[0]\n" - ".inst 0x6f83e11f // udot v31.4s, v8.16b, v3.4b[0]\n" - "112:" // Height 4: Multiply loop: No odd multiplies + "107:" // Height 4: Multiply loop: unique 16: skip row sum + "ldr q10, [x28, #0x0]\n" + 
".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n" + "ldr q4, [x28, #0x10]\n" + ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n" + "ldr q5, [x28, #0x20]\n" + ".inst 0x6f82e158 // udot v24.4s, v10.16b, v2.4b[0]\n" + "ldr q6, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + ".inst 0x6f83e15c // udot v28.4s, v10.16b, v3.4b[0]\n" + ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6f82e099 // udot v25.4s, v4.16b, v2.4b[0]\n" + ".inst 0x6f83e09d // udot v29.4s, v4.16b, v3.4b[0]\n" + ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n" + ".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n" + ".inst 0x6f83e0be // udot v30.4s, v5.16b, v3.4b[0]\n" + ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0db // udot v27.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0df // udot v31.4s, v6.16b, v3.4b[0]\n" + "108:" // Height 4: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x28, x28, #0x1\n" - "cmp x28, x19\n" - "bne 98b\n" - "prfm pstl1keep, [x9, #0x0]\n" - "prfm pstl1keep, [x25, #0x0]\n" - "prfm pstl1keep, [x23, #0x0]\n" + "add x25, x25, #0x1\n" + "cmp x25, x19\n" + "bne 94b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x21, x26, x19\n" "prfm pstl1keep, [x21, #0x0]\n" - "tbnz %x[flags], #31, 113f\n" + "add x20, x21, x19\n" + "prfm pstl1keep, [x20, #0x0]\n" + "add x19, x20, x19\n" + "prfm pstl1keep, [x19, #0x0]\n" + "tbnz %x[flags], #31, 109f\n" "addp v11.4s, v11.4s, v11.4s\n" - "add x19, %x[qp], %[b_offset]\n" - "ld1r { v4.4s }, [x19]\n" + "add x22, %x[qp], %[b_offset]\n" + "ld1r { v4.4s }, [x22]\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" "addp v14.4s, v14.4s, v14.4s\n" @@ -1754,24 +1713,24 @@ void a64_hybrid_u8qa_dot_4x16 ( "mul v12.4s, v12.4s, v4.4s\n" 
"mul v13.4s, v13.4s, v4.4s\n" "mul v14.4s, v14.4s, v4.4s\n" - "113:" // Height 4: skip row sum fixup + "109:" // Height 4: skip row sum fixup "add v16.4s, v16.4s, v11.4s\n" + "ldr q0, [x27, #0x0]\n" "orr %x[flags], %x[flags], #0x80000000\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q0, [x10, #0x0]\n" + "ldr q1, [x27, #0x10]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add v18.4s, v18.4s, v11.4s\n" - "ldr q1, [x10, #0x10]\n" + "ldr q2, [x27, #0x20]\n" + "add x22, %x[qp], %[per_layer_mul]\n" "add v19.4s, v19.4s, v11.4s\n" - "ldr q2, [x10, #0x20]\n" + "ldr q3, [x27, #0x30]\n" + "add x27, x27, #0x40\n" "add v20.4s, v20.4s, v12.4s\n" - "ldr q3, [x10, #0x30]\n" + "ld1r { v4.4s }, [x22]\n" "add v21.4s, v21.4s, v12.4s\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" "add v22.4s, v22.4s, v12.4s\n" - "add x19, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x19]\n" "add v23.4s, v23.4s, v12.4s\n" - "add x10, x10, #0x40\n" "add v24.4s, v24.4s, v13.4s\n" "add v25.4s, v25.4s, v13.4s\n" "add v26.4s, v26.4s, v13.4s\n" @@ -1793,7 +1752,7 @@ void a64_hybrid_u8qa_dot_4x16 ( "add v26.4s, v26.4s, v2.4s\n" "add v27.4s, v27.4s, v3.4s\n" "add v28.4s, v28.4s, v0.4s\n" - "ld1r { v0.4s }, [x20]\n" + "ld1r { v0.4s }, [x23]\n" "add v29.4s, v29.4s, v1.4s\n" "add v30.4s, v30.4s, v2.4s\n" "add v31.4s, v31.4s, v3.4s\n" @@ -1813,7 +1772,7 @@ void a64_hybrid_u8qa_dot_4x16 ( "sqrdmulh v29.4s, v29.4s, v4.4s\n" "sqrdmulh v30.4s, v30.4s, v4.4s\n" "sqrdmulh v31.4s, v31.4s, v4.4s\n" - "tbz %x[flags], #5, 114f\n" + "tbz %x[flags], #5, 110f\n" "and v4.16b, v16.16b, v0.16b\n" "sshr v4.4s, v4.4s, #0x1f\n" "and v5.16b, v17.16b, v0.16b\n" @@ -1862,18 +1821,18 @@ void a64_hybrid_u8qa_dot_4x16 ( "sqadd v29.4s, v29.4s, v10.4s\n" "sqadd v30.4s, v30.4s, v4.4s\n" "sqadd v31.4s, v31.4s, v5.4s\n" - "114:" // Height 4: no shift correction + "110:" // Height 4: no shift correction "srshl v16.4s, v16.4s, v0.4s\n" - "add x19, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x19]\n" + "add x22, %x[qp], %[c_offset]\n" + "ld1r 
{ v4.4s }, [x22]\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x19, %x[qp], %[minval]\n" + "add x22, %x[qp], %[minval]\n" "srshl v18.4s, v18.4s, v0.4s\n" - "ld1r { v5.4s }, [x19]\n" - "add x19, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x22]\n" + "add x22, %x[qp], %[maxval]\n" "srshl v19.4s, v19.4s, v0.4s\n" - "ld1r { v6.4s }, [x19]\n" - "cmp x12, #0x10\n" + "ld1r { v6.4s }, [x22]\n" + "cmp x9, #0x10\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" "srshl v22.4s, v22.4s, v0.4s\n" @@ -1946,125 +1905,122 @@ void a64_hybrid_u8qa_dot_4x16 ( "uzp1 v20.16b, v20.16b, v21.16b\n" "uzp1 v24.16b, v24.16b, v25.16b\n" "uzp1 v28.16b, v28.16b, v29.16b\n" - "bge 123f\n" - "tbz x12, #3, 118f\n" - "str d16, [x9], #0x8\n" - "str d20, [x25], #0x8\n" - "str d24, [x23], #0x8\n" - "str d28, [x21], #0x8\n" - "tbz x12, #2, 116f\n" - "st1 { v16.s }[2], [x9], #0x4\n" - "st1 { v20.s }[2], [x25], #0x4\n" - "st1 { v24.s }[2], [x23], #0x4\n" - "st1 { v28.s }[2], [x21], #0x4\n" - "tbz x12, #1, 115f\n" - "st1 { v16.h }[6], [x9], #0x2\n" - "st1 { v20.h }[6], [x25], #0x2\n" - "st1 { v24.h }[6], [x23], #0x2\n" - "st1 { v28.h }[6], [x21], #0x2\n" - "tbz x12, #0, 122f\n" - "st1 { v16.b }[14], [x9]\n" - "st1 { v20.b }[14], [x25]\n" - "st1 { v24.b }[14], [x23]\n" - "st1 { v28.b }[14], [x21]\n" - "b 122f\n" - "115:" // Height 4: Partial direct writeback: partial_1_12 - "tbz x12, #0, 122f\n" - "st1 { v16.b }[12], [x9]\n" - "st1 { v20.b }[12], [x25]\n" - "st1 { v24.b }[12], [x23]\n" - "st1 { v28.b }[12], [x21]\n" - "b 122f\n" - "116:" // Height 4: Partial direct writeback: partial_2_8 - "tbz x12, #1, 117f\n" - "st1 { v16.h }[4], [x9], #0x2\n" - "st1 { v20.h }[4], [x25], #0x2\n" - "st1 { v24.h }[4], [x23], #0x2\n" - "st1 { v28.h }[4], [x21], #0x2\n" - "tbz x12, #0, 122f\n" - "st1 { v16.b }[10], [x9]\n" - "st1 { v20.b }[10], [x25]\n" - "st1 { v24.b }[10], [x23]\n" - "st1 { v28.b }[10], [x21]\n" - "b 122f\n" - "117:" // Height 4: Partial direct writeback: partial_1_8 - "tbz x12, #0, 122f\n" - "st1 
{ v16.b }[8], [x9]\n" - "st1 { v20.b }[8], [x25]\n" - "st1 { v24.b }[8], [x23]\n" - "st1 { v28.b }[8], [x21]\n" - "b 122f\n" - "118:" // Height 4: Partial direct writeback: partial_4_0 - "tbz x12, #2, 120f\n" - "str s16, [x9], #0x4\n" - "str s20, [x25], #0x4\n" - "str s24, [x23], #0x4\n" - "str s28, [x21], #0x4\n" - "tbz x12, #1, 119f\n" - "st1 { v16.h }[2], [x9], #0x2\n" - "st1 { v20.h }[2], [x25], #0x2\n" - "st1 { v24.h }[2], [x23], #0x2\n" - "st1 { v28.h }[2], [x21], #0x2\n" - "tbz x12, #0, 122f\n" - "st1 { v16.b }[6], [x9]\n" - "st1 { v20.b }[6], [x25]\n" - "st1 { v24.b }[6], [x23]\n" - "st1 { v28.b }[6], [x21]\n" - "b 122f\n" - "119:" // Height 4: Partial direct writeback: partial_1_4 - "tbz x12, #0, 122f\n" - "st1 { v16.b }[4], [x9]\n" - "st1 { v20.b }[4], [x25]\n" - "st1 { v24.b }[4], [x23]\n" - "st1 { v28.b }[4], [x21]\n" - "b 122f\n" - "120:" // Height 4: Partial direct writeback: partial_2_0 - "tbz x12, #1, 121f\n" - "str h16, [x9], #0x2\n" - "str h20, [x25], #0x2\n" - "str h24, [x23], #0x2\n" - "str h28, [x21], #0x2\n" - "tbz x12, #0, 122f\n" - "st1 { v16.b }[2], [x9]\n" - "st1 { v20.b }[2], [x25]\n" - "st1 { v24.b }[2], [x23]\n" - "st1 { v28.b }[2], [x21]\n" - "b 122f\n" - "121:" // Height 4: Partial direct writeback: partial_1_0 - "str b16, [x9, #0x0]\n" - "str b20, [x25, #0x0]\n" - "str b24, [x23, #0x0]\n" - "str b28, [x21, #0x0]\n" - "122:" // Height 4: Partial direct writeback: Done - "b 124f\n" - "123:" // Height 4: Full writeback - "str q16, [x9, #0x0]\n" - "str q20, [x25, #0x0]\n" - "str q24, [x23, #0x0]\n" - "str q28, [x21, #0x0]\n" - "add x9, x9, #0x10\n" - "add x25, x25, #0x10\n" - "add x23, x23, #0x10\n" - "add x21, x21, #0x10\n" - "124:" // Height 4: Writeback done - "subs x12, x12, #0x10\n" - "bgt 96b\n" + "bge 119f\n" + "tbz x9, #3, 114f\n" + "str d16, [x26], #0x8\n" + "str d20, [x21], #0x8\n" + "str d24, [x20], #0x8\n" + "str d28, [x19], #0x8\n" + "tbz x9, #2, 112f\n" + "st1 { v16.s }[2], [x26], #0x4\n" + "st1 { v20.s }[2], [x21], #0x4\n" 
+ "st1 { v24.s }[2], [x20], #0x4\n" + "st1 { v28.s }[2], [x19], #0x4\n" + "tbz x9, #1, 111f\n" + "st1 { v16.h }[6], [x26], #0x2\n" + "st1 { v20.h }[6], [x21], #0x2\n" + "st1 { v24.h }[6], [x20], #0x2\n" + "st1 { v28.h }[6], [x19], #0x2\n" + "tbz x9, #0, 118f\n" + "st1 { v16.b }[14], [x26]\n" + "st1 { v20.b }[14], [x21]\n" + "st1 { v24.b }[14], [x20]\n" + "st1 { v28.b }[14], [x19]\n" + "b 118f\n" + "111:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x9, #0, 118f\n" + "st1 { v16.b }[12], [x26]\n" + "st1 { v20.b }[12], [x21]\n" + "st1 { v24.b }[12], [x20]\n" + "st1 { v28.b }[12], [x19]\n" + "b 118f\n" + "112:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x9, #1, 113f\n" + "st1 { v16.h }[4], [x26], #0x2\n" + "st1 { v20.h }[4], [x21], #0x2\n" + "st1 { v24.h }[4], [x20], #0x2\n" + "st1 { v28.h }[4], [x19], #0x2\n" + "tbz x9, #0, 118f\n" + "st1 { v16.b }[10], [x26]\n" + "st1 { v20.b }[10], [x21]\n" + "st1 { v24.b }[10], [x20]\n" + "st1 { v28.b }[10], [x19]\n" + "b 118f\n" + "113:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x9, #0, 118f\n" + "st1 { v16.b }[8], [x26]\n" + "st1 { v20.b }[8], [x21]\n" + "st1 { v24.b }[8], [x20]\n" + "st1 { v28.b }[8], [x19]\n" + "b 118f\n" + "114:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x9, #2, 116f\n" + "str s16, [x26], #0x4\n" + "str s20, [x21], #0x4\n" + "str s24, [x20], #0x4\n" + "str s28, [x19], #0x4\n" + "tbz x9, #1, 115f\n" + "st1 { v16.h }[2], [x26], #0x2\n" + "st1 { v20.h }[2], [x21], #0x2\n" + "st1 { v24.h }[2], [x20], #0x2\n" + "st1 { v28.h }[2], [x19], #0x2\n" + "tbz x9, #0, 118f\n" + "st1 { v16.b }[6], [x26]\n" + "st1 { v20.b }[6], [x21]\n" + "st1 { v24.b }[6], [x20]\n" + "st1 { v28.b }[6], [x19]\n" + "b 118f\n" + "115:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x9, #0, 118f\n" + "st1 { v16.b }[4], [x26]\n" + "st1 { v20.b }[4], [x21]\n" + "st1 { v24.b }[4], [x20]\n" + "st1 { v28.b }[4], [x19]\n" + "b 118f\n" + "116:" // Height 4: Partial direct 
writeback: partial_2_0 + "tbz x9, #1, 117f\n" + "str h16, [x26], #0x2\n" + "str h20, [x21], #0x2\n" + "str h24, [x20], #0x2\n" + "str h28, [x19], #0x2\n" + "tbz x9, #0, 118f\n" + "st1 { v16.b }[2], [x26]\n" + "st1 { v20.b }[2], [x21]\n" + "st1 { v24.b }[2], [x20]\n" + "st1 { v28.b }[2], [x19]\n" + "b 118f\n" + "117:" // Height 4: Partial direct writeback: partial_1_0 + "str b16, [x26, #0x0]\n" + "str b20, [x21, #0x0]\n" + "str b24, [x20, #0x0]\n" + "str b28, [x19, #0x0]\n" + "118:" // Height 4: Partial direct writeback: Done + "b 120f\n" + "119:" // Height 4: Full writeback + "str q16, [x26, #0x0]\n" + "add x26, x26, #0x10\n" + "str q20, [x21, #0x0]\n" + "str q24, [x20, #0x0]\n" + "str q28, [x19, #0x0]\n" + "120:" // Height 4: Writeback done + "subs x9, x9, #0x10\n" + "bgt 92b\n" "subs %x[M], %x[M], #0x4\n" - "beq 126f\n" + "beq 122f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 125f\n" + "tbz %x[flags], #3, 121f\n" "add x20, x20, #0x4\n" "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "125:" // Update direct input + "121:" // Update direct input "mov x19, #0x4\n" "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" "b 1b\n" - "126:" // Exit + "122:" // Exit : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, 
string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp index 58fbdcf2a8..da07fc17a1 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp @@ -37,9 +37,9 @@ namespace arm_gemm { - // Actual kernel implementations void a64_hybrid_u8u32_dot_6x16( ARGLIST ); +void a64_hybrid_u8u32_dot_6x16_a55( ARGLIST ); class cls_a64_hybrid_u8u32_dot_6x16 { @@ -72,10 +72,11 @@ public: StdTransformsFixed transforms = {}; - static PerformanceParameters get_performance_parameters(const CPUInfo *ci) { + static PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { switch (ci->get_cpu_model()) { case CPUModel::A55r1: - return { 9.5238, 2.0799, 0.2279 }; + return { 12.667, 2.0799, 0.2279 }; default: return { 29.6736, 11.4025, 0.5591 }; } @@ -83,9 +84,15 @@ public: // Default to the generic kernel kern_type kernel=a64_hybrid_u8u32_dot_6x16; - - cls_a64_hybrid_u8u32_dot_6x16(const CPUInfo *) + cls_a64_hybrid_u8u32_dot_6x16(const CPUInfo *ci) { + switch(ci->get_cpu_model()) { + 
default: + break; + case CPUModel::A55r1: + kernel=a64_hybrid_u8u32_dot_6x16_a55; + break; + } } }; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp new file mode 100644 index 0000000000..f131eb2eb5 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp @@ -0,0 +1,3499 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include + +namespace arm_gemm { + +void a64_hybrid_u8u32_dot_6x16_a55 ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg A_arg, + size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg output_arg, + const uint32_t *, Activation, bool accumulate +) +{ + struct KernelArgs { + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const uint8_t *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + __asm__ __volatile__( + + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 171f\n" + "cmp %x[M], #0x4\n" + "bgt 137f\n" + "beq 103f\n" + "cmp %x[M], #0x2\n" + "bgt 69f\n" + "beq 35f\n" + "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x12, %x[output_ptr]\n" + "2:" // Height 1: Column loop + "tbz %x[flags], #0, 12f\n" + "cmp x14, #0x10\n" + "bge 11f\n" + "tbz x14, #3, 6f\n" + "ld1 { v8.4s }, [x12], #0x10\n" + "ld1 { v9.4s }, [x12], #0x10\n" + "tbz x14, #2, 4f\n" + "ld1 { v10.4s }, [x12], #0x10\n" + "tbz x14, #1, 3f\n" + "mov x24, #0x38\n" + "ldr d11, [x12], #0x8\n" + 
"tbz x14, #0, 10f\n" + "ld1 { v11.s }[2], [x12]\n" + "b 10f\n" + "3:" // Height 1: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x14, #0, 10f\n" + "ldr s11, [x12, #0x0]\n" + "b 10f\n" + "4:" // Height 1: Partial accumulate: partial_2_8 + "tbz x14, #1, 5f\n" + "ldr d10, [x12], #0x8\n" + "mov x24, #0x28\n" + "tbz x14, #0, 10f\n" + "ld1 { v10.s }[2], [x12]\n" + "b 10f\n" + "5:" // Height 1: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x14, #0, 10f\n" + "ldr s10, [x12, #0x0]\n" + "b 10f\n" + "6:" // Height 1: Partial accumulate: partial_4_0 + "tbz x14, #2, 8f\n" + "ld1 { v8.4s }, [x12], #0x10\n" + "tbz x14, #1, 7f\n" + "mov x24, #0x18\n" + "ldr d9, [x12], #0x8\n" + "tbz x14, #0, 10f\n" + "ld1 { v9.s }[2], [x12]\n" + "b 10f\n" + "7:" // Height 1: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x14, #0, 10f\n" + "ldr s9, [x12, #0x0]\n" + "b 10f\n" + "8:" // Height 1: Partial accumulate: partial_2_0 + "tbz x14, #1, 9f\n" + "ldr d8, [x12], #0x8\n" + "mov x24, #0x8\n" + "tbz x14, #0, 10f\n" + "ld1 { v8.s }[2], [x12]\n" + "b 10f\n" + "9:" // Height 1: Partial accumulate: partial_1_0 + "ldr s8, [x12, #0x0]\n" + "mov x24, #0x0\n" + "10:" // Height 1: Partial accumulate: Done + "sub x12, x12, x24\n" + "b 13f\n" + "11:" // Height 1: full accumulate + "ldr q8, [x12, #0x0]\n" + "ldr q9, [x12, #0x10]\n" + "ldr q10, [x12, #0x20]\n" + "ldr q11, [x12, #0x30]\n" + "b 13f\n" + "12:" // Height 1: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "13:" // Height 1: setup done + "mov x11, #0x0\n" + "14:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w10, [x20, x11, LSL #0x2]\n" + "tbz %x[flags], #3, 15f\n" + "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x9, [x20, #0x0]\n" + "cbnz x11, 16f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + 
"add x9, x9, x19\n" + "b 16f\n" + "15:" // Height 1: setup direct input + "mov x9, %x[input_ptr]\n" + "16:" // Height 1: input setup done + "cmp x10, #0x10\n" + "blt 19f\n" + "ldr q0, [x9, #0x0]\n" + "ldr q6, [x13, #0x0]\n" + "cmp x10, #0x20\n" + "blt 18f\n" + "17:" // Height 1: Multiply loop: Main loop head + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr d7, [x13, #0x10]\n" + "ldr x19, [x13, #0x18]\n" + "add x9, x9, #0x10\n" + "ldr d6, [x13, #0x20]\n" + "sub x10, x10, #0x10\n" + "ldr x28, [x13, #0x28]\n" + "cmp x10, #0x20\n" + "mov v7.d[1], x19\n" + "prfm pldl1keep, [x9, #0x80]\n" + "ldr x19, [x13, #0x38]\n" + "mov v6.d[1], x28\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "ldr d7, [x13, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "ldr d6, [x13, #0x40]\n" + "ldr x28, [x13, #0x48]\n" + "mov v7.d[1], x19\n" + "ldr x19, [x13, #0x58]\n" + "ldr x27, [x9, #0x8]\n" + "mov v6.d[1], x28\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + "ldr d7, [x13, #0x50]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + "ldr d6, [x13, #0x60]\n" + "ldr x28, [x13, #0x68]\n" + "mov v7.d[1], x19\n" + "ldr x19, [x13, #0x78]\n" + "mov v6.d[1], x28\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + "ldr d7, [x13, #0x70]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + "ldr d6, [x13, #0x80]\n" + "ldr x28, [x13, #0x88]\n" + "mov v7.d[1], x19\n" + "ldr x19, [x13, #0x98]\n" + "mov v6.d[1], x28\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + "ldr d7, [x13, #0x90]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + "ldr d6, [x13, #0xa0]\n" + "ldr x28, [x13, #0xa8]\n" + "mov v7.d[1], x19\n" + "ldr x19, [x13, #0xb8]\n" + "mov v6.d[1], x28\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + "ldr d7, [x13, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + "ldr d6, [x13, #0xc0]\n" + "ldr x28, [x13, #0xc8]\n" + "mov v7.d[1], x19\n" + "ldr x19, [x13, #0xd8]\n" + "mov 
v6.d[1], x28\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + "ldr d7, [x13, #0xd0]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + "ldr d6, [x13, #0xe0]\n" + "ldr x28, [x13, #0xe8]\n" + "mov v7.d[1], x19\n" + "ldr x19, [x13, #0xf8]\n" + "mov v6.d[1], x28\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + "ldr d7, [x13, #0xf0]\n" + "add x13, x13, #0x100\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + "ldr d6, [x13, #0x0]\n" + "ldr x28, [x13, #0x8]\n" + "mov v7.d[1], x19\n" + "mov v6.d[1], x28\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + "ldr d0, [x9, #0x0]\n" + "mov v0.d[1], x27\n" + "bge 17b\n" + "18:" // Height 1: Multiply loop: Single iteration only + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x13, #0x10]\n" + "ldr q6, [x13, #0x20]\n" + "sub x10, x10, #0x10\n" + "add x9, x9, #0x10\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "prfm pldl1keep, [x9, #0x80]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x13, #0x30]\n" + "ldr q6, [x13, #0x40]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x13, #0x50]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x13, #0x60]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x13, #0x70]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x13, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x13, #0x90]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x13, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x13, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x13, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x13, #0xd0]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + "ldr q6, [x13, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + "ldr q7, [x13, #0xf0]\n" + ".inst 
0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + "add x13, x13, #0x100\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + "19:" // Height 1: Multiply loop: Main loop skip + "cbz x10, 24f\n" + "cmp x10, #0x4\n" + "blt 21f\n" + "20:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x9], #0x4\n" + "sub x10, x10, #0x4\n" + "ldr q6, [x13, #0x0]\n" + "cmp x10, #0x4\n" + "ldr q7, [x13, #0x10]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q6, [x13, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + "bge 20b\n" + "cbz x10, 24f\n" + "21:" // Height 1: Multiply loop: Skip odd blocks + "tbz x10, #1, 22f\n" + "ldr h0, [x9], #0x2\n" + "tbz x10, #0, 23f\n" + "ld1 { v0.b }[2], [x9]\n" + "b 23f\n" + "22:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x9, #0x0]\n" + "23:" // Height 1: Multiply loop: Ragged operand read: Done + "ldr q6, [x13, #0x0]\n" + "ldr q7, [x13, #0x10]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q6, [x13, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + "24:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x11, x11, #0x1\n" + "cmp x11, x19\n" + "bne 14b\n" + "prfm pstl1keep, [x12, #0x0]\n" + "cmp x14, #0x10\n" + "bge 33f\n" + "tbz x14, #3, 28f\n" + "st1 { v8.4s }, [x12], #0x10\n" + "st1 { v9.4s }, [x12], #0x10\n" + "tbz x14, #2, 26f\n" + "st1 { v10.4s }, [x12], #0x10\n" + "tbz x14, #1, 25f\n" + "str d11, [x12], #0x8\n" + "tbz x14, #0, 32f\n" + "st1 { v11.s }[2], [x12]\n" + "b 32f\n" + "25:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x14, #0, 32f\n" 
+ "str s11, [x12, #0x0]\n" + "b 32f\n" + "26:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x14, #1, 27f\n" + "str d10, [x12], #0x8\n" + "tbz x14, #0, 32f\n" + "st1 { v10.s }[2], [x12]\n" + "b 32f\n" + "27:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x14, #0, 32f\n" + "str s10, [x12, #0x0]\n" + "b 32f\n" + "28:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x14, #2, 30f\n" + "st1 { v8.4s }, [x12], #0x10\n" + "tbz x14, #1, 29f\n" + "str d9, [x12], #0x8\n" + "tbz x14, #0, 32f\n" + "st1 { v9.s }[2], [x12]\n" + "b 32f\n" + "29:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x14, #0, 32f\n" + "str s9, [x12, #0x0]\n" + "b 32f\n" + "30:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x14, #1, 31f\n" + "str d8, [x12], #0x8\n" + "tbz x14, #0, 32f\n" + "st1 { v8.s }[2], [x12]\n" + "b 32f\n" + "31:" // Height 1: Partial direct writeback: partial_1_0 + "str s8, [x12, #0x0]\n" + "32:" // Height 1: Partial direct writeback: Done + "b 34f\n" + "33:" // Height 1: Full writeback + "str q8, [x12, #0x0]\n" + "str q9, [x12, #0x10]\n" + "str q10, [x12, #0x20]\n" + "str q11, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + "34:" // Height 1: Writeback done + "subs x14, x14, #0x10\n" + "bgt 2b\n" + "b 206f\n" + "35:" // Height 2 + "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" + "mov x12, %x[output_ptr]\n" + "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "36:" // Height 2: Column loop + "tbz %x[flags], #0, 46f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x14, #0x10\n" + "add x23, x12, x19, LSL #2\n" + "bge 45f\n" + "tbz x14, #3, 40f\n" + "ld1 { v8.4s }, [x12], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v9.4s }, [x12], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "tbz x14, #2, 38f\n" + "ld1 { v10.4s }, [x12], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "tbz x14, #1, 37f\n" + "mov x24, #0x38\n" + "ldr d11, [x12], #0x8\n" + "ldr d15, [x23], #0x8\n" + "tbz x14, #0, 44f\n" + "ld1 { v11.s }[2], [x12]\n" 
+ "ld1 { v15.s }[2], [x23]\n" + "b 44f\n" + "37:" // Height 2: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x14, #0, 44f\n" + "ldr s11, [x12, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "b 44f\n" + "38:" // Height 2: Partial accumulate: partial_2_8 + "tbz x14, #1, 39f\n" + "ldr d10, [x12], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "tbz x14, #0, 44f\n" + "ld1 { v10.s }[2], [x12]\n" + "ld1 { v14.s }[2], [x23]\n" + "b 44f\n" + "39:" // Height 2: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x14, #0, 44f\n" + "ldr s10, [x12, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "b 44f\n" + "40:" // Height 2: Partial accumulate: partial_4_0 + "tbz x14, #2, 42f\n" + "ld1 { v8.4s }, [x12], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "tbz x14, #1, 41f\n" + "mov x24, #0x18\n" + "ldr d9, [x12], #0x8\n" + "ldr d13, [x23], #0x8\n" + "tbz x14, #0, 44f\n" + "ld1 { v9.s }[2], [x12]\n" + "ld1 { v13.s }[2], [x23]\n" + "b 44f\n" + "41:" // Height 2: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x14, #0, 44f\n" + "ldr s9, [x12, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "b 44f\n" + "42:" // Height 2: Partial accumulate: partial_2_0 + "tbz x14, #1, 43f\n" + "ldr d8, [x12], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "tbz x14, #0, 44f\n" + "ld1 { v8.s }[2], [x12]\n" + "ld1 { v12.s }[2], [x23]\n" + "b 44f\n" + "43:" // Height 2: Partial accumulate: partial_1_0 + "ldr s8, [x12, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "44:" // Height 2: Partial accumulate: Done + "sub x12, x12, x24\n" + "b 47f\n" + "45:" // Height 2: full accumulate + "ldr q8, [x12, #0x0]\n" + "ldr q9, [x12, #0x10]\n" + "ldr q10, [x12, #0x20]\n" + "ldr q11, [x12, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + "b 47f\n" + "46:" // Height 2: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi 
v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "47:" // Height 2: setup done + "mov x11, #0x0\n" + "48:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w10, [x20, x11, LSL #0x2]\n" + "tbz %x[flags], #3, 49f\n" + "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x9, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "cbnz x11, 50f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x9, x9, x19\n" + "add x26, x26, x19\n" + "b 50f\n" + "49:" // Height 2: setup direct input + "mov x9, %x[input_ptr]\n" + "add x26, x9, x19\n" + "50:" // Height 2: input setup done + "cmp x10, #0x10\n" + "blt 53f\n" + "ldr q0, [x9, #0x0]\n" + "ldr q1, [x26, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q6, [x13, #0x0]\n" + "blt 52f\n" + "51:" // Height 2: Multiply loop: Main loop head + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "ldr d7, [x13, #0x10]\n" + "add x9, x9, #0x10\n" + "ldr x19, [x13, #0x18]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x9, #0x80]\n" + "sub x10, x10, #0x10\n" + "ldr d6, [x13, #0x20]\n" + "cmp x10, #0x20\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0x28]\n" + "ldr x19, [x13, #0x38]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "mov v6.d[1], x28\n" + "ldr d7, [x13, #0x30]\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x13, #0x40]\n" + "ldr x28, [x13, #0x48]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + "ldr d7, [x13, #0x50]\n" + "mov v6.d[1], x28\n" + "ldr x19, [x13, #0x58]\n" + "ldr x28, [x13, #0x68]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 
0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x13, #0x60]\n" + "ldr x19, [x13, #0x78]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + "mov v6.d[1], x28\n" + "ldr d7, [x13, #0x70]\n" + "ldr x28, [x13, #0x88]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x13, #0x80]\n" + "ldr x19, [x13, #0x98]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + "mov v6.d[1], x28\n" + "ldr d7, [x13, #0x90]\n" + "ldr x28, [x13, #0xa8]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x13, #0xa0]\n" + "ldr x19, [x13, #0xb8]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + "mov v6.d[1], x28\n" + "ldr d7, [x13, #0xb0]\n" + "ldr x28, [x13, #0xc8]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x13, #0xc0]\n" + "ldr x19, [x13, #0xd8]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + "mov v6.d[1], x28\n" + "ldr d7, [x13, #0xd0]\n" + "ldr x28, [x13, #0xe8]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x13, #0xe0]\n" + "ldr x19, [x13, #0xf8]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + "mov v6.d[1], x28\n" + "ldr d7, [x13, #0xf0]\n" + "ldr x27, [x9, #0x8]\n" + "add x13, x13, #0x100\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + "mov v7.d[1], x19\n" + "ldr 
x25, [x26, #0x8]\n" + "ldr d6, [x13, #0x0]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + "ldr d0, [x9, #0x0]\n" + "ldr d1, [x26, #0x0]\n" + "ldr x28, [x13, #0x8]\n" + "mov v0.d[1], x27\n" + "mov v1.d[1], x25\n" + "mov v6.d[1], x28\n" + "bge 51b\n" + "52:" // Height 2: Multiply loop: Single iteration only + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q7, [x13, #0x10]\n" + "sub x10, x10, #0x10\n" + "ldr q6, [x13, #0x20]\n" + "add x9, x9, #0x10\n" + "prfm pldl1keep, [x9, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + "ldr q7, [x13, #0x30]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x13, #0x40]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x13, #0x50]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x13, #0x60]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x13, #0x70]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x13, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x13, #0x90]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x13, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x13, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 
0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x13, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x13, #0xd0]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + "ldr q6, [x13, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + "ldr q7, [x13, #0xf0]\n" + "add x13, x13, #0x100\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + "53:" // Height 2: Multiply loop: Main loop skip + "cbz x10, 58f\n" + "cmp x10, #0x4\n" + "blt 55f\n" + "54:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x9], #0x4\n" + "sub x10, x10, #0x4\n" + "ldr s1, [x26], #0x4\n" + "cmp x10, #0x4\n" + "ldr q6, [x13, #0x0]\n" + "ldr q7, [x13, #0x10]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + "bge 54b\n" + "cbz x10, 58f\n" + "55:" // Height 2: Multiply loop: Skip odd blocks + "tbz x10, #1, 56f\n" + "ldr h0, [x9], #0x2\n" + "ldr h1, [x26], #0x2\n" + "tbz x10, #0, 57f\n" + "ld1 { v0.b }[2], [x9]\n" + "ld1 { v1.b }[2], [x26]\n" + "b 57f\n" + "56:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x9, #0x0]\n" + "ldr b1, [x26, #0x0]\n" + "57:" // Height 2: Multiply loop: Ragged operand read: 
Done + "ldr q6, [x13, #0x0]\n" + "ldr q7, [x13, #0x10]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + "58:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x11, x11, #0x1\n" + "cmp x11, x19\n" + "bne 48b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x12, #0x0]\n" + "cmp x14, #0x10\n" + "add x23, x12, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "bge 67f\n" + "tbz x14, #3, 62f\n" + "st1 { v8.4s }, [x12], #0x10\n" + "st1 { v9.4s }, [x12], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v13.4s }, [x23], #0x10\n" + "tbz x14, #2, 60f\n" + "st1 { v10.4s }, [x12], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "tbz x14, #1, 59f\n" + "str d11, [x12], #0x8\n" + "str d15, [x23], #0x8\n" + "tbz x14, #0, 66f\n" + "st1 { v11.s }[2], [x12]\n" + "st1 { v15.s }[2], [x23]\n" + "b 66f\n" + "59:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x14, #0, 66f\n" + "str s11, [x12, #0x0]\n" + "str s15, [x23, #0x0]\n" + "b 66f\n" + "60:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x14, #1, 61f\n" + "str d10, [x12], #0x8\n" + "str d14, [x23], #0x8\n" + "tbz x14, #0, 66f\n" + "st1 { v10.s }[2], [x12]\n" + "st1 { v14.s }[2], [x23]\n" + "b 66f\n" + "61:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x14, #0, 66f\n" + "str s10, [x12, #0x0]\n" + "str s14, [x23, #0x0]\n" + "b 66f\n" + "62:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x14, #2, 64f\n" + "st1 { v8.4s }, 
[x12], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "tbz x14, #1, 63f\n" + "str d9, [x12], #0x8\n" + "str d13, [x23], #0x8\n" + "tbz x14, #0, 66f\n" + "st1 { v9.s }[2], [x12]\n" + "st1 { v13.s }[2], [x23]\n" + "b 66f\n" + "63:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x14, #0, 66f\n" + "str s9, [x12, #0x0]\n" + "str s13, [x23, #0x0]\n" + "b 66f\n" + "64:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x14, #1, 65f\n" + "str d8, [x12], #0x8\n" + "str d12, [x23], #0x8\n" + "tbz x14, #0, 66f\n" + "st1 { v8.s }[2], [x12]\n" + "st1 { v12.s }[2], [x23]\n" + "b 66f\n" + "65:" // Height 2: Partial direct writeback: partial_1_0 + "str s8, [x12, #0x0]\n" + "str s12, [x23, #0x0]\n" + "66:" // Height 2: Partial direct writeback: Done + "b 68f\n" + "67:" // Height 2: Full writeback + "str q8, [x12, #0x0]\n" + "str q9, [x12, #0x10]\n" + "str q10, [x12, #0x20]\n" + "str q11, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + "str q12, [x23, #0x0]\n" + "str q13, [x23, #0x10]\n" + "str q14, [x23, #0x20]\n" + "str q15, [x23, #0x30]\n" + "68:" // Height 2: Writeback done + "subs x14, x14, #0x10\n" + "bgt 36b\n" + "b 206f\n" + "69:" // Height 3 + "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" + "mov x12, %x[output_ptr]\n" + "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "70:" // Height 3: Column loop + "tbz %x[flags], #0, 80f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x14, #0x10\n" + "add x23, x12, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "bge 79f\n" + "tbz x14, #3, 74f\n" + "ld1 { v8.4s }, [x12], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v9.4s }, [x12], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "tbz x14, #2, 72f\n" + "ld1 { v10.4s }, [x12], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { v18.4s }, [x22], #0x10\n" + "tbz x14, #1, 71f\n" + "ldr d11, [x12], #0x8\n" + "mov x24, #0x38\n" + "ldr d15, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "tbz 
x14, #0, 78f\n" + "ld1 { v11.s }[2], [x12]\n" + "ld1 { v15.s }[2], [x23]\n" + "ld1 { v19.s }[2], [x22]\n" + "b 78f\n" + "71:" // Height 3: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x14, #0, 78f\n" + "ldr s11, [x12, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "b 78f\n" + "72:" // Height 3: Partial accumulate: partial_2_8 + "tbz x14, #1, 73f\n" + "ldr d10, [x12], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "ldr d18, [x22], #0x8\n" + "tbz x14, #0, 78f\n" + "ld1 { v10.s }[2], [x12]\n" + "ld1 { v14.s }[2], [x23]\n" + "ld1 { v18.s }[2], [x22]\n" + "b 78f\n" + "73:" // Height 3: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x14, #0, 78f\n" + "ldr s10, [x12, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "b 78f\n" + "74:" // Height 3: Partial accumulate: partial_4_0 + "tbz x14, #2, 76f\n" + "ld1 { v8.4s }, [x12], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "tbz x14, #1, 75f\n" + "ldr d9, [x12], #0x8\n" + "mov x24, #0x18\n" + "ldr d13, [x23], #0x8\n" + "ldr d17, [x22], #0x8\n" + "tbz x14, #0, 78f\n" + "ld1 { v9.s }[2], [x12]\n" + "ld1 { v13.s }[2], [x23]\n" + "ld1 { v17.s }[2], [x22]\n" + "b 78f\n" + "75:" // Height 3: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x14, #0, 78f\n" + "ldr s9, [x12, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "b 78f\n" + "76:" // Height 3: Partial accumulate: partial_2_0 + "tbz x14, #1, 77f\n" + "ldr d8, [x12], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "ldr d16, [x22], #0x8\n" + "tbz x14, #0, 78f\n" + "ld1 { v8.s }[2], [x12]\n" + "ld1 { v12.s }[2], [x23]\n" + "ld1 { v16.s }[2], [x22]\n" + "b 78f\n" + "77:" // Height 3: Partial accumulate: partial_1_0 + "ldr s8, [x12, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "ldr s16, [x22, #0x0]\n" + "78:" // Height 3: Partial accumulate: Done + "sub x12, x12, x24\n" + "b 81f\n" + "79:" // Height 3: full accumulate + "ldr 
q8, [x12, #0x0]\n" + "ldr q9, [x12, #0x10]\n" + "ldr q10, [x12, #0x20]\n" + "ldr q11, [x12, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + "ldr q16, [x22, #0x0]\n" + "ldr q17, [x22, #0x10]\n" + "ldr q18, [x22, #0x20]\n" + "ldr q19, [x22, #0x30]\n" + "b 81f\n" + "80:" // Height 3: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "81:" // Height 3: setup done + "mov x11, #0x0\n" + "82:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w10, [x20, x11, LSL #0x2]\n" + "tbz %x[flags], #3, 83f\n" + "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x9, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "cbnz x11, 84f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x9, x9, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "b 84f\n" + "83:" // Height 3: setup direct input + "mov x9, %x[input_ptr]\n" + "add x26, x9, x19\n" + "add x24, x26, x19\n" + "84:" // Height 3: input setup done + "cmp x10, #0x10\n" + "blt 87f\n" + "ldr q0, [x9, #0x0]\n" + "ldr q1, [x26, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q2, [x24, #0x0]\n" + "ldr q6, [x13, #0x0]\n" + "blt 86f\n" + "85:" // Height 3: Multiply loop: Main loop head + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + "ldr d7, [x13, #0x10]\n" + "ldr x19, [x13, #0x18]\n" + "add x9, x9, #0x10\n" + "ldr d6, [x13, #0x20]\n" + "add x26, x26, #0x10\n" + "ldr x28, [x13, #0x28]\n" + "add x24, x24, #0x10\n" + "mov v7.d[1], x19\n" + 
"prfm pldl1keep, [x9, #0x80]\n" + "ldr x19, [x13, #0x38]\n" + "sub x10, x10, #0x10\n" + "mov v6.d[1], x28\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + "ldr d7, [x13, #0x30]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "cmp x10, #0x20\n" + "ldr d6, [x13, #0x40]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0x48]\n" + "ldr x19, [x13, #0x58]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + "mov v6.d[1], x28\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + "ldr d7, [x13, #0x50]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x13, #0x60]\n" + "ldr x28, [x13, #0x68]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" + "ldr d7, [x13, #0x70]\n" + "mov v6.d[1], x28\n" + "ldr x19, [x13, #0x78]\n" + "ldr x28, [x13, #0x88]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + "mov v7.d[1], x19\n" + ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" + "ldr d6, [x13, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" + "ldr d7, [x13, #0x90]\n" + "mov v6.d[1], x28\n" + "ldr x19, [x13, #0x98]\n" + "ldr x28, [x13, #0xa8]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, 
v1.4b[2]\n" + "mov v7.d[1], x19\n" + ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" + "ldr d6, [x13, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" + "ldr d7, [x13, #0xb0]\n" + "mov v6.d[1], x28\n" + "ldr x19, [x13, #0xb8]\n" + "ldr x28, [x13, #0xc8]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" + "mov v7.d[1], x19\n" + ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" + "ldr d6, [x13, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" + "ldr d7, [x13, #0xd0]\n" + "mov v6.d[1], x28\n" + "ldr x19, [x13, #0xd8]\n" + "ldr x28, [x13, #0xe8]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + "mov v7.d[1], x19\n" + ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" + "ldr d6, [x13, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" + "ldr d7, [x13, #0xf0]\n" + "mov v6.d[1], x28\n" + "ldr x19, [x13, #0xf8]\n" + "ldr x27, [x9, #0x8]\n" + "add x13, x13, #0x100\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + "mov v7.d[1], x19\n" + ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" + "ldr x25, [x26, #0x8]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + "ldr d0, [x9, #0x0]\n" + "ldr d1, [x26, #0x0]\n" + "ldr d2, [x24, #0x0]\n" + "mov v0.d[1], x27\n" + "ldr x21, [x24, #0x8]\n" + "mov v1.d[1], x25\n" + "ldr d6, [x13, #0x0]\n" + "ldr x28, [x13, #0x8]\n" + "mov 
v2.d[1], x21\n" + "mov v6.d[1], x28\n" + "bge 85b\n" + "86:" // Height 3: Multiply loop: Single iteration only + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q7, [x13, #0x10]\n" + "ldr q6, [x13, #0x20]\n" + "sub x10, x10, #0x10\n" + "add x9, x9, #0x10\n" + "add x26, x26, #0x10\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "add x24, x24, #0x10\n" + "ldr q7, [x13, #0x30]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x13, #0x40]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x13, #0x50]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" + "ldr q6, [x13, #0x60]\n" + "ldr q7, [x13, #0x70]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" + "ldr q6, [x13, #0x80]\n" + "ldr q7, [x13, #0x90]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, 
v1.4b[2]\n" + ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" + "ldr q6, [x13, #0xa0]\n" + "ldr q7, [x13, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" + "ldr q6, [x13, #0xc0]\n" + "ldr q7, [x13, #0xd0]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" + "ldr q6, [x13, #0xe0]\n" + "ldr q7, [x13, #0xf0]\n" + "add x13, x13, #0x100\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + "87:" // Height 3: Multiply loop: Main loop skip + "cbz x10, 92f\n" + "cmp x10, #0x4\n" + "blt 89f\n" + "88:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x9], #0x4\n" + "sub x10, x10, #0x4\n" + "ldr s1, [x26], #0x4\n" + "cmp x10, #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr q6, [x13, #0x0]\n" + "ldr q7, [x13, #0x10]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 
0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + "bge 88b\n" + "cbz x10, 92f\n" + "89:" // Height 3: Multiply loop: Skip odd blocks + "tbz x10, #1, 90f\n" + "ldr h0, [x9], #0x2\n" + "ldr h1, [x26], #0x2\n" + "ldr h2, [x24], #0x2\n" + "tbz x10, #0, 91f\n" + "ld1 { v0.b }[2], [x9]\n" + "ld1 { v1.b }[2], [x26]\n" + "ld1 { v2.b }[2], [x24]\n" + "b 91f\n" + "90:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x9, #0x0]\n" + "ldr b1, [x26, #0x0]\n" + "ldr b2, [x24, #0x0]\n" + "91:" // Height 3: Multiply loop: Ragged operand read: Done + "ldr q6, [x13, #0x0]\n" + "ldr q7, [x13, #0x10]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + "92:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x11, x11, #0x1\n" + "cmp x11, x19\n" + "bne 
82b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x12, #0x0]\n" + "cmp x14, #0x10\n" + "add x23, x12, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "bge 101f\n" + "tbz x14, #3, 96f\n" + "st1 { v8.4s }, [x12], #0x10\n" + "st1 { v9.4s }, [x12], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v13.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v17.4s }, [x22], #0x10\n" + "tbz x14, #2, 94f\n" + "st1 { v10.4s }, [x12], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "st1 { v18.4s }, [x22], #0x10\n" + "tbz x14, #1, 93f\n" + "str d11, [x12], #0x8\n" + "str d15, [x23], #0x8\n" + "str d19, [x22], #0x8\n" + "tbz x14, #0, 100f\n" + "st1 { v11.s }[2], [x12]\n" + "st1 { v15.s }[2], [x23]\n" + "st1 { v19.s }[2], [x22]\n" + "b 100f\n" + "93:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x14, #0, 100f\n" + "str s11, [x12, #0x0]\n" + "str s15, [x23, #0x0]\n" + "str s19, [x22, #0x0]\n" + "b 100f\n" + "94:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x14, #1, 95f\n" + "str d10, [x12], #0x8\n" + "str d14, [x23], #0x8\n" + "str d18, [x22], #0x8\n" + "tbz x14, #0, 100f\n" + "st1 { v10.s }[2], [x12]\n" + "st1 { v14.s }[2], [x23]\n" + "st1 { v18.s }[2], [x22]\n" + "b 100f\n" + "95:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x14, #0, 100f\n" + "str s10, [x12, #0x0]\n" + "str s14, [x23, #0x0]\n" + "str s18, [x22, #0x0]\n" + "b 100f\n" + "96:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x14, #2, 98f\n" + "st1 { v8.4s }, [x12], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "tbz x14, #1, 97f\n" + "str d9, [x12], #0x8\n" + "str d13, [x23], #0x8\n" + "str d17, [x22], #0x8\n" + "tbz x14, #0, 100f\n" + "st1 { v9.s }[2], [x12]\n" + "st1 { v13.s }[2], [x23]\n" + "st1 { v17.s }[2], [x22]\n" + "b 100f\n" + "97:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x14, #0, 100f\n" 
+ "str s9, [x12, #0x0]\n" + "str s13, [x23, #0x0]\n" + "str s17, [x22, #0x0]\n" + "b 100f\n" + "98:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x14, #1, 99f\n" + "str d8, [x12], #0x8\n" + "str d12, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "tbz x14, #0, 100f\n" + "st1 { v8.s }[2], [x12]\n" + "st1 { v12.s }[2], [x23]\n" + "st1 { v16.s }[2], [x22]\n" + "b 100f\n" + "99:" // Height 3: Partial direct writeback: partial_1_0 + "str s8, [x12, #0x0]\n" + "str s12, [x23, #0x0]\n" + "str s16, [x22, #0x0]\n" + "100:" // Height 3: Partial direct writeback: Done + "b 102f\n" + "101:" // Height 3: Full writeback + "str q8, [x12, #0x0]\n" + "str q9, [x12, #0x10]\n" + "str q10, [x12, #0x20]\n" + "str q11, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + "str q12, [x23, #0x0]\n" + "str q13, [x23, #0x10]\n" + "str q14, [x23, #0x20]\n" + "str q15, [x23, #0x30]\n" + "str q16, [x22, #0x0]\n" + "str q17, [x22, #0x10]\n" + "str q18, [x22, #0x20]\n" + "str q19, [x22, #0x30]\n" + "102:" // Height 3: Writeback done + "subs x14, x14, #0x10\n" + "bgt 70b\n" + "b 206f\n" + "103:" // Height 4 + "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" + "mov x12, %x[output_ptr]\n" + "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "104:" // Height 4: Column loop + "tbz %x[flags], #0, 114f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x14, #0x10\n" + "add x23, x12, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "bge 113f\n" + "tbz x14, #3, 108f\n" + "ld1 { v8.4s }, [x12], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v9.4s }, [x12], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v21.4s }, [x21], #0x10\n" + "tbz x14, #2, 106f\n" + "ld1 { v10.4s }, [x12], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { v18.4s }, [x22], #0x10\n" + "ld1 { v22.4s }, [x21], #0x10\n" + "tbz x14, #1, 105f\n" + "ldr d11, [x12], #0x8\n" + "mov x24, 
#0x38\n" + "ldr d15, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" + "tbz x14, #0, 112f\n" + "ld1 { v11.s }[2], [x12]\n" + "ld1 { v15.s }[2], [x23]\n" + "ld1 { v19.s }[2], [x22]\n" + "ld1 { v23.s }[2], [x21]\n" + "b 112f\n" + "105:" // Height 4: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x14, #0, 112f\n" + "ldr s11, [x12, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "ldr s23, [x21, #0x0]\n" + "b 112f\n" + "106:" // Height 4: Partial accumulate: partial_2_8 + "tbz x14, #1, 107f\n" + "ldr d10, [x12], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "ldr d18, [x22], #0x8\n" + "ldr d22, [x21], #0x8\n" + "tbz x14, #0, 112f\n" + "ld1 { v10.s }[2], [x12]\n" + "ld1 { v14.s }[2], [x23]\n" + "ld1 { v18.s }[2], [x22]\n" + "ld1 { v22.s }[2], [x21]\n" + "b 112f\n" + "107:" // Height 4: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x14, #0, 112f\n" + "ldr s10, [x12, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "ldr s22, [x21, #0x0]\n" + "b 112f\n" + "108:" // Height 4: Partial accumulate: partial_4_0 + "tbz x14, #2, 110f\n" + "ld1 { v8.4s }, [x12], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "tbz x14, #1, 109f\n" + "ldr d9, [x12], #0x8\n" + "mov x24, #0x18\n" + "ldr d13, [x23], #0x8\n" + "ldr d17, [x22], #0x8\n" + "ldr d21, [x21], #0x8\n" + "tbz x14, #0, 112f\n" + "ld1 { v9.s }[2], [x12]\n" + "ld1 { v13.s }[2], [x23]\n" + "ld1 { v17.s }[2], [x22]\n" + "ld1 { v21.s }[2], [x21]\n" + "b 112f\n" + "109:" // Height 4: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x14, #0, 112f\n" + "ldr s9, [x12, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "ldr s21, [x21, #0x0]\n" + "b 112f\n" + "110:" // Height 4: Partial accumulate: partial_2_0 + "tbz x14, #1, 111f\n" + "ldr d8, [x12], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "ldr d16, [x22], #0x8\n" + "ldr d20, [x21], 
#0x8\n" + "tbz x14, #0, 112f\n" + "ld1 { v8.s }[2], [x12]\n" + "ld1 { v12.s }[2], [x23]\n" + "ld1 { v16.s }[2], [x22]\n" + "ld1 { v20.s }[2], [x21]\n" + "b 112f\n" + "111:" // Height 4: Partial accumulate: partial_1_0 + "ldr s8, [x12, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "ldr s16, [x22, #0x0]\n" + "ldr s20, [x21, #0x0]\n" + "112:" // Height 4: Partial accumulate: Done + "sub x12, x12, x24\n" + "b 115f\n" + "113:" // Height 4: full accumulate + "ldr q8, [x12, #0x0]\n" + "ldr q9, [x12, #0x10]\n" + "ldr q10, [x12, #0x20]\n" + "ldr q11, [x12, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + "ldr q16, [x22, #0x0]\n" + "ldr q17, [x22, #0x10]\n" + "ldr q18, [x22, #0x20]\n" + "ldr q19, [x22, #0x30]\n" + "ldr q20, [x21, #0x0]\n" + "ldr q21, [x21, #0x10]\n" + "ldr q22, [x21, #0x20]\n" + "ldr q23, [x21, #0x30]\n" + "b 115f\n" + "114:" // Height 4: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "115:" // Height 4: setup done + "mov x11, #0x0\n" + "116:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w10, [x20, x11, LSL #0x2]\n" + "tbz %x[flags], #3, 117f\n" + "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x9, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "cbnz x11, 118f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x9, x9, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "b 118f\n" + "117:" // 
Height 4: setup direct input + "mov x9, %x[input_ptr]\n" + "add x26, x9, x19\n" + "add x24, x26, x19\n" + "add x23, x24, x19\n" + "118:" // Height 4: input setup done + "cmp x10, #0x10\n" + "blt 121f\n" + "ldr q0, [x9, #0x0]\n" + "ldr q1, [x26, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q2, [x24, #0x0]\n" + "ldr q3, [x23, #0x0]\n" + "ldr q6, [x13, #0x0]\n" + "blt 120f\n" + "119:" // Height 4: Multiply loop: Main loop head + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + "ldr d7, [x13, #0x10]\n" + "add x9, x9, #0x10\n" + "ldr x19, [x13, #0x18]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x9, #0x80]\n" + "add x24, x24, #0x10\n" + "ldr d6, [x13, #0x20]\n" + "add x23, x23, #0x10\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0x28]\n" + "ldr x19, [x13, #0x38]\n" + "sub x10, x10, #0x10\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "mov v6.d[1], x28\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + "ldr d7, [x13, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + "mov v7.d[1], x19\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr d6, [x13, #0x40]\n" + "cmp x10, #0x20\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + "ldr x28, [x13, #0x48]\n" + "ldr d7, [x13, #0x50]\n" + "ldr x19, [x13, #0x58]\n" + "mov v6.d[1], x28\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr x28, [x13, #0x68]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, 
v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x13, #0x60]\n" + "ldr x19, [x13, #0x78]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + "mov v6.d[1], x28\n" + ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" + "ldr d7, [x13, #0x70]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" + "mov v7.d[1], x19\n" + "prfm pldl1keep, [x23, #0x80]\n" + "ldr d6, [x13, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" + "ldr x28, [x13, #0x88]\n" + "ldr d7, [x13, #0x90]\n" + "ldr x19, [x13, #0x98]\n" + "mov v6.d[1], x28\n" + "ldr x28, [x13, #0xa8]\n" + "ldr x27, [x9, #0x8]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x13, #0xa0]\n" + "ldr x19, [x13, #0xb8]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + "mov v6.d[1], x28\n" + ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" + "ldr d7, [x13, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d6 // 
udot v22.4s, v6.16b, v3.4b[2]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x13, #0xc0]\n" + "ldr x28, [x13, #0xc8]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" + "mov v6.d[1], x28\n" + "ldr d7, [x13, #0xd0]\n" + "ldr x19, [x13, #0xd8]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x13, #0xe0]\n" + "ldr x28, [x13, #0xe8]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" + "mov v6.d[1], x28\n" + "ldr d7, [x13, #0xf0]\n" + "ldr x19, [x13, #0xf8]\n" + "add x13, x13, #0x100\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" + "mov v7.d[1], x19\n" + "ldr d6, [x13, #0x0]\n" + "ldr x28, [x13, #0x8]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" + "mov v6.d[1], x28\n" + "ldr d0, [x9, #0x0]\n" + "ldr d1, [x26, #0x0]\n" + "ldr x25, [x26, #0x8]\n" + "mov v0.d[1], x27\n" + "ldr d2, [x24, #0x0]\n" + "ldr x21, [x24, #0x8]\n" + "mov v1.d[1], x25\n" + "ldr d3, [x23, #0x0]\n" + "ldr x19, [x23, #0x8]\n" + "mov v2.d[1], x21\n" + "mov v3.d[1], x19\n" + "bge 119b\n" + "120:" // Height 4: Multiply loop: Single iteration only + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + 
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + "ldr q7, [x13, #0x10]\n" + "sub x10, x10, #0x10\n" + "ldr q6, [x13, #0x20]\n" + "add x9, x9, #0x10\n" + "prfm pldl1keep, [x9, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + "ldr q7, [x13, #0x30]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x23, x23, #0x10\n" + "ldr q6, [x13, #0x40]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" + "ldr q7, [x13, #0x50]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x13, #0x60]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" + "ldr q7, [x13, #0x70]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "ldr q6, [x13, #0x80]\n" + ".inst 
0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" + "ldr q7, [x13, #0x90]\n" + "ldr q6, [x13, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" + "ldr q7, [x13, #0xb0]\n" + "ldr q6, [x13, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" + "ldr q7, [x13, #0xd0]\n" + "ldr q6, [x13, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" + "ldr q7, [x13, #0xf0]\n" + "add x13, x13, #0x100\n" + 
".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" + "121:" // Height 4: Multiply loop: Main loop skip + "cbz x10, 126f\n" + "cmp x10, #0x4\n" + "blt 123f\n" + "122:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x9], #0x4\n" + "sub x10, x10, #0x4\n" + "ldr s1, [x26], #0x4\n" + "cmp x10, #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr s3, [x23], #0x4\n" + "ldr q6, [x13, #0x0]\n" + "ldr q7, [x13, #0x10]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + "bge 122b\n" + "cbz x10, 126f\n" + "123:" // Height 4: Multiply loop: Skip odd blocks + "tbz x10, #1, 124f\n" + "ldr h0, [x9], #0x2\n" + "ldr h1, [x26], #0x2\n" + "ldr h2, [x24], #0x2\n" + "ldr h3, [x23], #0x2\n" + "tbz x10, #0, 125f\n" + "ld1 { v0.b }[2], [x9]\n" + "ld1 { v1.b }[2], [x26]\n" + "ld1 { v2.b }[2], [x24]\n" + "ld1 { v3.b }[2], [x23]\n" + "b 125f\n" + "124:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr 
b0, [x9, #0x0]\n" + "ldr b1, [x26, #0x0]\n" + "ldr b2, [x24, #0x0]\n" + "ldr b3, [x23, #0x0]\n" + "125:" // Height 4: Multiply loop: Ragged operand read: Done + "ldr q6, [x13, #0x0]\n" + "ldr q7, [x13, #0x10]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + "126:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x11, x11, #0x1\n" + "cmp x11, x19\n" + "bne 116b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x12, #0x0]\n" + "cmp x14, #0x10\n" + "add x23, x12, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" + "prfm pstl1keep, [x21, #0x0]\n" + "bge 135f\n" + "tbz x14, #3, 130f\n" + "st1 { v8.4s }, [x12], #0x10\n" + "st1 { v9.4s }, [x12], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v13.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v17.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "st1 { v21.4s }, [x21], #0x10\n" + "tbz x14, #2, 
128f\n" + "st1 { v10.4s }, [x12], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "st1 { v18.4s }, [x22], #0x10\n" + "st1 { v22.4s }, [x21], #0x10\n" + "tbz x14, #1, 127f\n" + "str d11, [x12], #0x8\n" + "str d15, [x23], #0x8\n" + "str d19, [x22], #0x8\n" + "str d23, [x21], #0x8\n" + "tbz x14, #0, 134f\n" + "st1 { v11.s }[2], [x12]\n" + "st1 { v15.s }[2], [x23]\n" + "st1 { v19.s }[2], [x22]\n" + "st1 { v23.s }[2], [x21]\n" + "b 134f\n" + "127:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x14, #0, 134f\n" + "str s11, [x12, #0x0]\n" + "str s15, [x23, #0x0]\n" + "str s19, [x22, #0x0]\n" + "str s23, [x21, #0x0]\n" + "b 134f\n" + "128:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x14, #1, 129f\n" + "str d10, [x12], #0x8\n" + "str d14, [x23], #0x8\n" + "str d18, [x22], #0x8\n" + "str d22, [x21], #0x8\n" + "tbz x14, #0, 134f\n" + "st1 { v10.s }[2], [x12]\n" + "st1 { v14.s }[2], [x23]\n" + "st1 { v18.s }[2], [x22]\n" + "st1 { v22.s }[2], [x21]\n" + "b 134f\n" + "129:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x14, #0, 134f\n" + "str s10, [x12, #0x0]\n" + "str s14, [x23, #0x0]\n" + "str s18, [x22, #0x0]\n" + "str s22, [x21, #0x0]\n" + "b 134f\n" + "130:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x14, #2, 132f\n" + "st1 { v8.4s }, [x12], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "tbz x14, #1, 131f\n" + "str d9, [x12], #0x8\n" + "str d13, [x23], #0x8\n" + "str d17, [x22], #0x8\n" + "str d21, [x21], #0x8\n" + "tbz x14, #0, 134f\n" + "st1 { v9.s }[2], [x12]\n" + "st1 { v13.s }[2], [x23]\n" + "st1 { v17.s }[2], [x22]\n" + "st1 { v21.s }[2], [x21]\n" + "b 134f\n" + "131:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x14, #0, 134f\n" + "str s9, [x12, #0x0]\n" + "str s13, [x23, #0x0]\n" + "str s17, [x22, #0x0]\n" + "str s21, [x21, #0x0]\n" + "b 134f\n" + "132:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x14, #1, 133f\n" + 
"str d8, [x12], #0x8\n" + "str d12, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "str d20, [x21], #0x8\n" + "tbz x14, #0, 134f\n" + "st1 { v8.s }[2], [x12]\n" + "st1 { v12.s }[2], [x23]\n" + "st1 { v16.s }[2], [x22]\n" + "st1 { v20.s }[2], [x21]\n" + "b 134f\n" + "133:" // Height 4: Partial direct writeback: partial_1_0 + "str s8, [x12, #0x0]\n" + "str s12, [x23, #0x0]\n" + "str s16, [x22, #0x0]\n" + "str s20, [x21, #0x0]\n" + "134:" // Height 4: Partial direct writeback: Done + "b 136f\n" + "135:" // Height 4: Full writeback + "str q8, [x12, #0x0]\n" + "str q9, [x12, #0x10]\n" + "str q10, [x12, #0x20]\n" + "str q11, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + "str q12, [x23, #0x0]\n" + "str q13, [x23, #0x10]\n" + "str q14, [x23, #0x20]\n" + "str q15, [x23, #0x30]\n" + "str q16, [x22, #0x0]\n" + "str q17, [x22, #0x10]\n" + "str q18, [x22, #0x20]\n" + "str q19, [x22, #0x30]\n" + "str q20, [x21, #0x0]\n" + "str q21, [x21, #0x10]\n" + "str q22, [x21, #0x20]\n" + "str q23, [x21, #0x30]\n" + "136:" // Height 4: Writeback done + "subs x14, x14, #0x10\n" + "bgt 104b\n" + "b 206f\n" + "137:" // Height 5 + "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" + "mov x12, %x[output_ptr]\n" + "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "138:" // Height 5: Column loop + "tbz %x[flags], #0, 148f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x14, #0x10\n" + "add x23, x12, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "bge 147f\n" + "tbz x14, #3, 142f\n" + "ld1 { v8.4s }, [x12], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v9.4s }, [x12], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v24.4s }, [x20], #0x10\n" + "ld1 { v21.4s }, [x21], #0x10\n" + "ld1 { v25.4s }, [x20], #0x10\n" + "tbz x14, #2, 140f\n" + "ld1 { v10.4s }, [x12], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { 
v18.4s }, [x22], #0x10\n" + "ld1 { v22.4s }, [x21], #0x10\n" + "ld1 { v26.4s }, [x20], #0x10\n" + "tbz x14, #1, 139f\n" + "ldr d11, [x12], #0x8\n" + "mov x24, #0x38\n" + "ldr d15, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" + "ldr d27, [x20], #0x8\n" + "tbz x14, #0, 146f\n" + "ld1 { v11.s }[2], [x12]\n" + "ld1 { v15.s }[2], [x23]\n" + "ld1 { v19.s }[2], [x22]\n" + "ld1 { v23.s }[2], [x21]\n" + "ld1 { v27.s }[2], [x20]\n" + "b 146f\n" + "139:" // Height 5: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x14, #0, 146f\n" + "ldr s11, [x12, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "ldr s23, [x21, #0x0]\n" + "ldr s27, [x20, #0x0]\n" + "b 146f\n" + "140:" // Height 5: Partial accumulate: partial_2_8 + "tbz x14, #1, 141f\n" + "ldr d10, [x12], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "ldr d18, [x22], #0x8\n" + "ldr d22, [x21], #0x8\n" + "ldr d26, [x20], #0x8\n" + "tbz x14, #0, 146f\n" + "ld1 { v10.s }[2], [x12]\n" + "ld1 { v14.s }[2], [x23]\n" + "ld1 { v18.s }[2], [x22]\n" + "ld1 { v22.s }[2], [x21]\n" + "ld1 { v26.s }[2], [x20]\n" + "b 146f\n" + "141:" // Height 5: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x14, #0, 146f\n" + "ldr s10, [x12, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "ldr s22, [x21, #0x0]\n" + "ldr s26, [x20, #0x0]\n" + "b 146f\n" + "142:" // Height 5: Partial accumulate: partial_4_0 + "tbz x14, #2, 144f\n" + "ld1 { v8.4s }, [x12], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v24.4s }, [x20], #0x10\n" + "tbz x14, #1, 143f\n" + "ldr d9, [x12], #0x8\n" + "mov x24, #0x18\n" + "ldr d13, [x23], #0x8\n" + "ldr d17, [x22], #0x8\n" + "ldr d21, [x21], #0x8\n" + "ldr d25, [x20], #0x8\n" + "tbz x14, #0, 146f\n" + "ld1 { v9.s }[2], [x12]\n" + "ld1 { v13.s }[2], [x23]\n" + "ld1 { v17.s }[2], [x22]\n" + "ld1 { v21.s }[2], [x21]\n" + "ld1 { v25.s }[2], [x20]\n" + "b 146f\n" + 
"143:" // Height 5: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x14, #0, 146f\n" + "ldr s9, [x12, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "ldr s21, [x21, #0x0]\n" + "ldr s25, [x20, #0x0]\n" + "b 146f\n" + "144:" // Height 5: Partial accumulate: partial_2_0 + "tbz x14, #1, 145f\n" + "ldr d8, [x12], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "ldr d16, [x22], #0x8\n" + "ldr d20, [x21], #0x8\n" + "ldr d24, [x20], #0x8\n" + "tbz x14, #0, 146f\n" + "ld1 { v8.s }[2], [x12]\n" + "ld1 { v12.s }[2], [x23]\n" + "ld1 { v16.s }[2], [x22]\n" + "ld1 { v20.s }[2], [x21]\n" + "ld1 { v24.s }[2], [x20]\n" + "b 146f\n" + "145:" // Height 5: Partial accumulate: partial_1_0 + "ldr s8, [x12, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "ldr s16, [x22, #0x0]\n" + "ldr s20, [x21, #0x0]\n" + "ldr s24, [x20, #0x0]\n" + "146:" // Height 5: Partial accumulate: Done + "sub x12, x12, x24\n" + "b 149f\n" + "147:" // Height 5: full accumulate + "ldr q8, [x12, #0x0]\n" + "ldr q9, [x12, #0x10]\n" + "ldr q10, [x12, #0x20]\n" + "ldr q11, [x12, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + "ldr q16, [x22, #0x0]\n" + "ldr q17, [x22, #0x10]\n" + "ldr q18, [x22, #0x20]\n" + "ldr q19, [x22, #0x30]\n" + "ldr q20, [x21, #0x0]\n" + "ldr q21, [x21, #0x10]\n" + "ldr q22, [x21, #0x20]\n" + "ldr q23, [x21, #0x30]\n" + "ldr q24, [x20, #0x0]\n" + "ldr q25, [x20, #0x10]\n" + "ldr q26, [x20, #0x20]\n" + "ldr q27, [x20, #0x30]\n" + "b 149f\n" + "148:" // Height 5: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" 
+ "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "149:" // Height 5: setup done + "mov x11, #0x0\n" + "150:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w10, [x20, x11, LSL #0x2]\n" + "tbz %x[flags], #3, 151f\n" + "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x9, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x11, 152f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x9, x9, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "b 152f\n" + "151:" // Height 5: setup direct input + "mov x9, %x[input_ptr]\n" + "add x26, x9, x19\n" + "add x24, x26, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "152:" // Height 5: input setup done + "cmp x10, #0x10\n" + "blt 155f\n" + "ldr q0, [x9, #0x0]\n" + "ldr q1, [x26, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q2, [x24, #0x0]\n" + "ldr q3, [x23, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x13, #0x0]\n" + "blt 154f\n" + "153:" // Height 5: Multiply loop: Main loop head + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" + "ldr d7, [x13, #0x10]\n" + "ldr x19, [x13, #0x18]\n" + "add x9, x9, #0x10\n" + "ldr d6, [x13, #0x20]\n" + "add x26, x26, #0x10\n" + "ldr x28, [x13, #0x28]\n" + "add x24, x24, #0x10\n" + "mov v7.d[1], x19\n" + "prfm pldl1keep, [x9, #0x80]\n" + "ldr x19, [x13, #0x38]\n" + "add x23, x23, #0x10\n" + "mov v6.d[1], x28\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, 
v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" + "ldr d7, [x13, #0x30]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x22, x22, #0x10\n" + "ldr d6, [x13, #0x40]\n" + "sub x10, x10, #0x10\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0x48]\n" + "ldr x19, [x13, #0x58]\n" + "cmp x10, #0x20\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + "mov v6.d[1], x28\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n" + "ldr d7, [x13, #0x50]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr d6, [x13, #0x60]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0x68]\n" + "ldr x19, [x13, #0x78]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + "mov v6.d[1], x28\n" + ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n" + "ldr d7, 
[x13, #0x70]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "ldr d6, [x13, #0x80]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0x88]\n" + "ldr x19, [x13, #0x98]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + "mov v6.d[1], x28\n" + ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n" + "ldr d7, [x13, #0x90]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr d6, [x13, #0xa0]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0xa8]\n" + "ldr x19, [x13, #0xb8]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + "mov v6.d[1], x28\n" + ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n" + "ldr d7, [x13, #0xb0]\n" + "ldr d6, [x13, #0xc0]\n" + "ldr x28, [x13, #0xc8]\n" + "mov v7.d[1], x19\n" + "ldr x19, [x13, #0xd8]\n" + "ldr x27, [x9, #0x8]\n" + "mov v6.d[1], x28\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, 
v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n" + "ldr d7, [x13, #0xd0]\n" + "ldr d6, [x13, #0xe0]\n" + "ldr x28, [x13, #0xe8]\n" + "mov v7.d[1], x19\n" + "ldr x19, [x13, #0xf8]\n" + "ldr x25, [x26, #0x8]\n" + "mov v6.d[1], x28\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n" + "ldr d7, [x13, #0xf0]\n" + "ldr x21, [x24, #0x8]\n" + "add x13, x13, #0x100\n" + "ldr d6, [x13, #0x0]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0x8]\n" + "ldr x19, [x23, #0x8]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + "mov v6.d[1], x28\n" + ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n" + "ldr d0, [x9, #0x0]\n" + "ldr d1, [x26, #0x0]\n" + "ldr d2, [x24, #0x0]\n" + "mov v0.d[1], x27\n" + "ldr d3, [x23, #0x0]\n" + "mov v1.d[1], x25\n" + "ldr d4, [x22, #0x0]\n" + "mov v2.d[1], x21\n" + "ldr x21, [x22, #0x8]\n" + "mov v3.d[1], x19\n" + "mov v4.d[1], x21\n" + "bge 153b\n" + "154:" // Height 5: Multiply loop: Single iteration only + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d4 // 
udot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" + "ldr q7, [x13, #0x10]\n" + "ldr q6, [x13, #0x20]\n" + "sub x10, x10, #0x10\n" + "add x9, x9, #0x10\n" + "add x26, x26, #0x10\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "add x24, x24, #0x10\n" + "ldr q7, [x13, #0x30]\n" + "add x23, x23, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x22, x22, #0x10\n" + "ldr q6, [x13, #0x40]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n" + "ldr q7, [x13, #0x50]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x13, #0x60]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, 
v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n" + "ldr q7, [x13, #0x70]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "ldr q6, [x13, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n" + "ldr q7, [x13, #0x90]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr q6, [x13, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n" + "ldr q7, [x13, #0xb0]\n" + "ldr q6, [x13, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 
0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n" + "ldr q7, [x13, #0xd0]\n" + "ldr q6, [x13, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n" + "ldr q7, [x13, #0xf0]\n" + "add x13, x13, #0x100\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n" + "155:" // Height 5: Multiply loop: Main loop skip + "cbz x10, 160f\n" + "cmp x10, #0x4\n" + "blt 157f\n" + "156:" // Height 5: Multiply loop: Odd block loop + "ldr s0, [x9], #0x4\n" + "sub x10, x10, #0x4\n" + "ldr s1, [x26], #0x4\n" + "cmp x10, #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr s3, [x23], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr q6, [x13, #0x0]\n" + "ldr q7, [x13, #0x10]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" + 
"ldr q6, [x13, #0x20]\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" + "bge 156b\n" + "cbz x10, 160f\n" + "157:" // Height 5: Multiply loop: Skip odd blocks + "tbz x10, #1, 158f\n" + "ldr h0, [x9], #0x2\n" + "ldr h1, [x26], #0x2\n" + "ldr h2, [x24], #0x2\n" + "ldr h3, [x23], #0x2\n" + "ldr h4, [x22], #0x2\n" + "tbz x10, #0, 159f\n" + "ld1 { v0.b }[2], [x9]\n" + "ld1 { v1.b }[2], [x26]\n" + "ld1 { v2.b }[2], [x24]\n" + "ld1 { v3.b }[2], [x23]\n" + "ld1 { v4.b }[2], [x22]\n" + "b 159f\n" + "158:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x9, #0x0]\n" + "ldr b1, [x26, #0x0]\n" + "ldr b2, [x24, #0x0]\n" + "ldr b3, [x23, #0x0]\n" + "ldr b4, [x22, #0x0]\n" + "159:" // Height 5: Multiply loop: Ragged operand read: Done + "ldr q6, [x13, #0x0]\n" + "ldr q7, [x13, #0x10]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q7, [x13, #0x30]\n" + "add x13, 
x13, #0x40\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" + "160:" // Height 5: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x11, x11, #0x1\n" + "cmp x11, x19\n" + "bne 150b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x12, #0x0]\n" + "cmp x14, #0x10\n" + "add x23, x12, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" + "prfm pstl1keep, [x21, #0x0]\n" + "add x20, x21, x19, LSL #2\n" + "prfm pstl1keep, [x20, #0x0]\n" + "bge 169f\n" + "tbz x14, #3, 164f\n" + "st1 { v8.4s }, [x12], #0x10\n" + "st1 { v9.4s }, [x12], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v13.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v17.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "st1 { v21.4s }, [x21], #0x10\n" + "st1 { v24.4s }, [x20], #0x10\n" + "st1 { v25.4s }, [x20], #0x10\n" + "tbz x14, #2, 162f\n" + "st1 { v10.4s }, [x12], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "st1 { v18.4s }, [x22], #0x10\n" + "st1 { v22.4s }, [x21], #0x10\n" + "st1 { v26.4s }, [x20], #0x10\n" + "tbz x14, #1, 161f\n" + "str d11, [x12], #0x8\n" + "str d15, [x23], #0x8\n" + "str d19, [x22], #0x8\n" + "str d23, [x21], #0x8\n" + "str d27, [x20], #0x8\n" + "tbz x14, #0, 168f\n" + "st1 { v11.s }[2], [x12]\n" + "st1 { v15.s }[2], [x23]\n" + "st1 { v19.s }[2], [x22]\n" + "st1 { v23.s }[2], 
[x21]\n" + "st1 { v27.s }[2], [x20]\n" + "b 168f\n" + "161:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x14, #0, 168f\n" + "str s11, [x12, #0x0]\n" + "str s15, [x23, #0x0]\n" + "str s19, [x22, #0x0]\n" + "str s23, [x21, #0x0]\n" + "str s27, [x20, #0x0]\n" + "b 168f\n" + "162:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x14, #1, 163f\n" + "str d10, [x12], #0x8\n" + "str d14, [x23], #0x8\n" + "str d18, [x22], #0x8\n" + "str d22, [x21], #0x8\n" + "str d26, [x20], #0x8\n" + "tbz x14, #0, 168f\n" + "st1 { v10.s }[2], [x12]\n" + "st1 { v14.s }[2], [x23]\n" + "st1 { v18.s }[2], [x22]\n" + "st1 { v22.s }[2], [x21]\n" + "st1 { v26.s }[2], [x20]\n" + "b 168f\n" + "163:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x14, #0, 168f\n" + "str s10, [x12, #0x0]\n" + "str s14, [x23, #0x0]\n" + "str s18, [x22, #0x0]\n" + "str s22, [x21, #0x0]\n" + "str s26, [x20, #0x0]\n" + "b 168f\n" + "164:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x14, #2, 166f\n" + "st1 { v8.4s }, [x12], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "st1 { v24.4s }, [x20], #0x10\n" + "tbz x14, #1, 165f\n" + "str d9, [x12], #0x8\n" + "str d13, [x23], #0x8\n" + "str d17, [x22], #0x8\n" + "str d21, [x21], #0x8\n" + "str d25, [x20], #0x8\n" + "tbz x14, #0, 168f\n" + "st1 { v9.s }[2], [x12]\n" + "st1 { v13.s }[2], [x23]\n" + "st1 { v17.s }[2], [x22]\n" + "st1 { v21.s }[2], [x21]\n" + "st1 { v25.s }[2], [x20]\n" + "b 168f\n" + "165:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x14, #0, 168f\n" + "str s9, [x12, #0x0]\n" + "str s13, [x23, #0x0]\n" + "str s17, [x22, #0x0]\n" + "str s21, [x21, #0x0]\n" + "str s25, [x20, #0x0]\n" + "b 168f\n" + "166:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x14, #1, 167f\n" + "str d8, [x12], #0x8\n" + "str d12, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "str d20, [x21], #0x8\n" + "str d24, [x20], #0x8\n" + "tbz x14, #0, 168f\n" + 
"st1 { v8.s }[2], [x12]\n" + "st1 { v12.s }[2], [x23]\n" + "st1 { v16.s }[2], [x22]\n" + "st1 { v20.s }[2], [x21]\n" + "st1 { v24.s }[2], [x20]\n" + "b 168f\n" + "167:" // Height 5: Partial direct writeback: partial_1_0 + "str s8, [x12, #0x0]\n" + "str s12, [x23, #0x0]\n" + "str s16, [x22, #0x0]\n" + "str s20, [x21, #0x0]\n" + "str s24, [x20, #0x0]\n" + "168:" // Height 5: Partial direct writeback: Done + "b 170f\n" + "169:" // Height 5: Full writeback + "str q8, [x12, #0x0]\n" + "str q9, [x12, #0x10]\n" + "str q10, [x12, #0x20]\n" + "str q11, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + "str q12, [x23, #0x0]\n" + "str q13, [x23, #0x10]\n" + "str q14, [x23, #0x20]\n" + "str q15, [x23, #0x30]\n" + "str q16, [x22, #0x0]\n" + "str q17, [x22, #0x10]\n" + "str q18, [x22, #0x20]\n" + "str q19, [x22, #0x30]\n" + "str q20, [x21, #0x0]\n" + "str q21, [x21, #0x10]\n" + "str q22, [x21, #0x20]\n" + "str q23, [x21, #0x30]\n" + "str q24, [x20, #0x0]\n" + "str q25, [x20, #0x10]\n" + "str q26, [x20, #0x20]\n" + "str q27, [x20, #0x30]\n" + "170:" // Height 5: Writeback done + "subs x14, x14, #0x10\n" + "bgt 138b\n" + "b 206f\n" + "171:" // Height 6 + "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" + "mov x12, %x[output_ptr]\n" + "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x20, #0x18\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "madd %x[output_ptr], x19, x20, %x[output_ptr]\n" + "172:" // Height 6: Column loop + "tbz %x[flags], #0, 182f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x14, #0x10\n" + "add x23, x12, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "add x19, x20, x19, LSL #2\n" + "bge 181f\n" + "tbz x14, #3, 176f\n" + "ld1 { v8.4s }, [x12], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v9.4s }, [x12], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { 
v24.4s }, [x20], #0x10\n" + "ld1 { v28.4s }, [x19], #0x10\n" + "ld1 { v21.4s }, [x21], #0x10\n" + "ld1 { v25.4s }, [x20], #0x10\n" + "ld1 { v29.4s }, [x19], #0x10\n" + "tbz x14, #2, 174f\n" + "ld1 { v10.4s }, [x12], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { v18.4s }, [x22], #0x10\n" + "ld1 { v22.4s }, [x21], #0x10\n" + "ld1 { v26.4s }, [x20], #0x10\n" + "ld1 { v30.4s }, [x19], #0x10\n" + "tbz x14, #1, 173f\n" + "ldr d11, [x12], #0x8\n" + "mov x24, #0x38\n" + "ldr d15, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" + "ldr d27, [x20], #0x8\n" + "ldr d31, [x19], #0x8\n" + "tbz x14, #0, 180f\n" + "ld1 { v11.s }[2], [x12]\n" + "ld1 { v15.s }[2], [x23]\n" + "ld1 { v19.s }[2], [x22]\n" + "ld1 { v23.s }[2], [x21]\n" + "ld1 { v27.s }[2], [x20]\n" + "ld1 { v31.s }[2], [x19]\n" + "b 180f\n" + "173:" // Height 6: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x14, #0, 180f\n" + "ldr s11, [x12, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "ldr s23, [x21, #0x0]\n" + "ldr s27, [x20, #0x0]\n" + "ldr s31, [x19, #0x0]\n" + "b 180f\n" + "174:" // Height 6: Partial accumulate: partial_2_8 + "tbz x14, #1, 175f\n" + "ldr d10, [x12], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "ldr d18, [x22], #0x8\n" + "ldr d22, [x21], #0x8\n" + "ldr d26, [x20], #0x8\n" + "ldr d30, [x19], #0x8\n" + "tbz x14, #0, 180f\n" + "ld1 { v10.s }[2], [x12]\n" + "ld1 { v14.s }[2], [x23]\n" + "ld1 { v18.s }[2], [x22]\n" + "ld1 { v22.s }[2], [x21]\n" + "ld1 { v26.s }[2], [x20]\n" + "ld1 { v30.s }[2], [x19]\n" + "b 180f\n" + "175:" // Height 6: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x14, #0, 180f\n" + "ldr s10, [x12, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "ldr s22, [x21, #0x0]\n" + "ldr s26, [x20, #0x0]\n" + "ldr s30, [x19, #0x0]\n" + "b 180f\n" + "176:" // Height 6: Partial accumulate: partial_4_0 + "tbz x14, #2, 178f\n" + "ld1 { v8.4s }, [x12], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + 
"ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v24.4s }, [x20], #0x10\n" + "ld1 { v28.4s }, [x19], #0x10\n" + "tbz x14, #1, 177f\n" + "ldr d9, [x12], #0x8\n" + "mov x24, #0x18\n" + "ldr d13, [x23], #0x8\n" + "ldr d17, [x22], #0x8\n" + "ldr d21, [x21], #0x8\n" + "ldr d25, [x20], #0x8\n" + "ldr d29, [x19], #0x8\n" + "tbz x14, #0, 180f\n" + "ld1 { v9.s }[2], [x12]\n" + "ld1 { v13.s }[2], [x23]\n" + "ld1 { v17.s }[2], [x22]\n" + "ld1 { v21.s }[2], [x21]\n" + "ld1 { v25.s }[2], [x20]\n" + "ld1 { v29.s }[2], [x19]\n" + "b 180f\n" + "177:" // Height 6: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x14, #0, 180f\n" + "ldr s9, [x12, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "ldr s21, [x21, #0x0]\n" + "ldr s25, [x20, #0x0]\n" + "ldr s29, [x19, #0x0]\n" + "b 180f\n" + "178:" // Height 6: Partial accumulate: partial_2_0 + "tbz x14, #1, 179f\n" + "ldr d8, [x12], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "ldr d16, [x22], #0x8\n" + "ldr d20, [x21], #0x8\n" + "ldr d24, [x20], #0x8\n" + "ldr d28, [x19], #0x8\n" + "tbz x14, #0, 180f\n" + "ld1 { v8.s }[2], [x12]\n" + "ld1 { v12.s }[2], [x23]\n" + "ld1 { v16.s }[2], [x22]\n" + "ld1 { v20.s }[2], [x21]\n" + "ld1 { v24.s }[2], [x20]\n" + "ld1 { v28.s }[2], [x19]\n" + "b 180f\n" + "179:" // Height 6: Partial accumulate: partial_1_0 + "ldr s8, [x12, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "ldr s16, [x22, #0x0]\n" + "ldr s20, [x21, #0x0]\n" + "ldr s24, [x20, #0x0]\n" + "ldr s28, [x19, #0x0]\n" + "180:" // Height 6: Partial accumulate: Done + "sub x12, x12, x24\n" + "b 183f\n" + "181:" // Height 6: full accumulate + "ldr q8, [x12, #0x0]\n" + "ldr q9, [x12, #0x10]\n" + "ldr q10, [x12, #0x20]\n" + "ldr q11, [x12, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + "ldr q16, [x22, #0x0]\n" + "ldr q17, [x22, #0x10]\n" + "ldr q18, [x22, #0x20]\n" + "ldr q19, [x22, #0x30]\n" + 
"ldr q20, [x21, #0x0]\n" + "ldr q21, [x21, #0x10]\n" + "ldr q22, [x21, #0x20]\n" + "ldr q23, [x21, #0x30]\n" + "ldr q24, [x20, #0x0]\n" + "ldr q25, [x20, #0x10]\n" + "ldr q26, [x20, #0x20]\n" + "ldr q27, [x20, #0x30]\n" + "ldr q28, [x19, #0x0]\n" + "ldr q29, [x19, #0x10]\n" + "ldr q30, [x19, #0x20]\n" + "ldr q31, [x19, #0x30]\n" + "b 183f\n" + "182:" // Height 6: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "183:" // Height 6: setup done + "mov x11, #0x0\n" + "184:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w10, [x20, x11, LSL #0x2]\n" + "tbz %x[flags], #3, 185f\n" + "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x9, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x11, 186f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x9, x9, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x20, x20, x19\n" + "b 186f\n" + "185:" // Height 6: setup direct input + "mov x9, %x[input_ptr]\n" + "add x26, x9, x19\n" + "add x24, x26, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x20, x22, x19\n" + "186:" // Height 6: input setup done + "cmp x10, #0x10\n" + "blt 189f\n" + "ldr q0, [x9, #0x0]\n" + "ldr q1, 
[x26, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q2, [x24, #0x0]\n" + "ldr q3, [x23, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x13, #0x0]\n" + "blt 188f\n" + "187:" // Height 6: Multiply loop: Main loop head + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n" + "ldr d7, [x13, #0x10]\n" + "add x9, x9, #0x10\n" + "ldr x19, [x13, #0x18]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x9, #0x80]\n" + "add x24, x24, #0x10\n" + "ldr d6, [x13, #0x20]\n" + "add x23, x23, #0x10\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0x28]\n" + "ldr x19, [x13, #0x38]\n" + "add x22, x22, #0x10\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "mov v6.d[1], x28\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n" + "ldr d7, [x13, #0x30]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x20, x20, #0x10\n" + "ldr d6, [x13, #0x40]\n" + "sub x10, x10, #0x10\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0x48]\n" + "ldr x19, [x13, #0x58]\n" + "cmp x10, #0x20\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + "mov v6.d[1], x28\n" + ".inst 0x6f82e0f3 // udot v19.4s, 
v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n" + ".inst 0x6fa5e0dc // udot v28.4s, v6.16b, v5.4b[1]\n" + "ldr d7, [x13, #0x50]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr d6, [x13, #0x60]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0x68]\n" + "ldr x19, [x13, #0x78]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + "mov v6.d[1], x28\n" + ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n" + ".inst 0x6fa5e0fd // udot v29.4s, v7.16b, v5.4b[1]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n" + ".inst 0x6fa5e0de // udot v30.4s, v6.16b, v5.4b[1]\n" + "ldr d7, [x13, #0x70]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "ldr d6, [x13, #0x80]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0x88]\n" + "ldr x19, [x13, #0x98]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + "mov v6.d[1], x28\n" + ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n" + ".inst 0x6fa5e0ff // udot v31.4s, v7.16b, v5.4b[1]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot 
v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n" + ".inst 0x6f85e8dc // udot v28.4s, v6.16b, v5.4b[2]\n" + "ldr d7, [x13, #0x90]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr d6, [x13, #0xa0]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0xa8]\n" + "ldr x19, [x13, #0xb8]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + "mov v6.d[1], x28\n" + ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n" + ".inst 0x6f85e8fd // udot v29.4s, v7.16b, v5.4b[2]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n" + ".inst 0x6f85e8de // udot v30.4s, v6.16b, v5.4b[2]\n" + "ldr d7, [x13, #0xb0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "ldr d6, [x13, #0xc0]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0xc8]\n" + "ldr x19, [x13, #0xd8]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + "mov v6.d[1], x28\n" + ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n" + ".inst 0x6f85e8ff // udot v31.4s, v7.16b, v5.4b[2]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n" + ".inst 0x6fa5e8dc // udot v28.4s, v6.16b, v5.4b[3]\n" + "ldr d7, [x13, 
#0xd0]\n" + "ldr d6, [x13, #0xe0]\n" + "ldr x28, [x13, #0xe8]\n" + "mov v7.d[1], x19\n" + "ldr x19, [x13, #0xf8]\n" + "ldr x27, [x9, #0x8]\n" + "mov v6.d[1], x28\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n" + ".inst 0x6fa5e8fd // udot v29.4s, v7.16b, v5.4b[3]\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x6fa5e8de // udot v30.4s, v6.16b, v5.4b[3]\n" + "ldr d7, [x13, #0xf0]\n" + "ldr x25, [x26, #0x8]\n" + "add x13, x13, #0x100\n" + "ldr d6, [x13, #0x0]\n" + "mov v7.d[1], x19\n" + "ldr x28, [x13, #0x8]\n" + "ldr x21, [x24, #0x8]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + "mov v6.d[1], x28\n" + ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n" + ".inst 0x6fa5e8ff // udot v31.4s, v7.16b, v5.4b[3]\n" + "ldr d0, [x9, #0x0]\n" + "ldr d1, [x26, #0x0]\n" + "ldr d2, [x24, #0x0]\n" + "mov v0.d[1], x27\n" + "ldr d3, [x23, #0x0]\n" + "mov v1.d[1], x25\n" + "ldr x19, [x23, #0x8]\n" + "mov v2.d[1], x21\n" + "ldr d4, [x22, #0x0]\n" + "ldr x21, [x22, #0x8]\n" + "mov v3.d[1], x19\n" + "ldr d5, [x20, #0x0]\n" + "ldr x19, [x20, #0x8]\n" + "mov v4.d[1], x21\n" + "mov v5.d[1], x19\n" + "bge 187b\n" + "188:" // Height 6: Multiply loop: Single iteration only + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + 
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n" + "ldr q7, [x13, #0x10]\n" + "sub x10, x10, #0x10\n" + "ldr q6, [x13, #0x20]\n" + "add x9, x9, #0x10\n" + "prfm pldl1keep, [x9, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n" + "ldr q7, [x13, #0x30]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x23, x23, #0x10\n" + "ldr q6, [x13, #0x40]\n" + "add x22, x22, #0x10\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n" + ".inst 0x6fa5e0dc // udot v28.4s, v6.16b, v5.4b[1]\n" + "ldr q7, [x13, #0x50]\n" + "add x20, x20, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x13, #0x60]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, 
v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n" + ".inst 0x6fa5e0fd // udot v29.4s, v7.16b, v5.4b[1]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n" + ".inst 0x6fa5e0de // udot v30.4s, v6.16b, v5.4b[1]\n" + "ldr q7, [x13, #0x70]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "ldr q6, [x13, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n" + ".inst 0x6fa5e0ff // udot v31.4s, v7.16b, v5.4b[1]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n" + ".inst 0x6f85e8dc // udot v28.4s, v6.16b, v5.4b[2]\n" + "ldr q7, [x13, #0x90]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr q6, [x13, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n" + ".inst 0x6f85e8fd // udot v29.4s, v7.16b, v5.4b[2]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d6 
// udot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n" + ".inst 0x6f85e8de // udot v30.4s, v6.16b, v5.4b[2]\n" + "ldr q7, [x13, #0xb0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "ldr q6, [x13, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n" + ".inst 0x6f85e8ff // udot v31.4s, v7.16b, v5.4b[2]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n" + ".inst 0x6fa5e8dc // udot v28.4s, v6.16b, v5.4b[3]\n" + "ldr q7, [x13, #0xd0]\n" + "ldr q6, [x13, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n" + ".inst 0x6fa5e8fd // udot v29.4s, v7.16b, v5.4b[3]\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x6fa5e8de // udot v30.4s, v6.16b, v5.4b[3]\n" + "ldr q7, [x13, #0xf0]\n" + "add x13, x13, #0x100\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n" + ".inst 0x6fa5e8ff // udot 
v31.4s, v7.16b, v5.4b[3]\n" + "189:" // Height 6: Multiply loop: Main loop skip + "cbz x10, 194f\n" + "cmp x10, #0x4\n" + "blt 191f\n" + "190:" // Height 6: Multiply loop: Odd block loop + "ldr s0, [x9], #0x4\n" + "sub x10, x10, #0x4\n" + "ldr s1, [x26], #0x4\n" + "cmp x10, #0x4\n" + "ldr s2, [x24], #0x4\n" + "ldr s3, [x23], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr s5, [x20], #0x4\n" + "ldr q6, [x13, #0x0]\n" + "ldr q7, [x13, #0x10]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n" + "bge 190b\n" + "cbz x10, 194f\n" + "191:" // Height 6: Multiply loop: Skip odd blocks + "tbz x10, #1, 192f\n" + "ldr h0, [x9], #0x2\n" 
+ "ldr h1, [x26], #0x2\n" + "ldr h2, [x24], #0x2\n" + "ldr h3, [x23], #0x2\n" + "ldr h4, [x22], #0x2\n" + "ldr h5, [x20], #0x2\n" + "tbz x10, #0, 193f\n" + "ld1 { v0.b }[2], [x9]\n" + "ld1 { v1.b }[2], [x26]\n" + "ld1 { v2.b }[2], [x24]\n" + "ld1 { v3.b }[2], [x23]\n" + "ld1 { v4.b }[2], [x22]\n" + "ld1 { v5.b }[2], [x20]\n" + "b 193f\n" + "192:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x9, #0x0]\n" + "ldr b1, [x26, #0x0]\n" + "ldr b2, [x24, #0x0]\n" + "ldr b3, [x23, #0x0]\n" + "ldr b4, [x22, #0x0]\n" + "ldr b5, [x20, #0x0]\n" + "193:" // Height 6: Multiply loop: Ragged operand read: Done + "ldr q6, [x13, #0x0]\n" + "ldr q7, [x13, #0x10]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q6, [x13, #0x20]\n" + "ldr q7, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 
0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n" + "194:" // Height 6: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x11, x11, #0x1\n" + "cmp x11, x19\n" + "bne 184b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x12, #0x0]\n" + "cmp x14, #0x10\n" + "add x23, x12, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" + "prfm pstl1keep, [x21, #0x0]\n" + "add x20, x21, x19, LSL #2\n" + "prfm pstl1keep, [x20, #0x0]\n" + "add x19, x20, x19, LSL #2\n" + "prfm pstl1keep, [x19, #0x0]\n" + "bge 203f\n" + "tbz x14, #3, 198f\n" + "st1 { v8.4s }, [x12], #0x10\n" + "st1 { v9.4s }, [x12], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v13.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v17.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "st1 { v21.4s }, [x21], #0x10\n" + "st1 { v24.4s }, [x20], #0x10\n" + "st1 { v25.4s }, [x20], #0x10\n" + "st1 { v28.4s }, [x19], #0x10\n" + "st1 { v29.4s }, [x19], #0x10\n" + "tbz x14, #2, 196f\n" + "st1 { v10.4s }, [x12], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "st1 { v18.4s }, [x22], #0x10\n" + "st1 { v22.4s }, [x21], #0x10\n" + "st1 { v26.4s }, [x20], #0x10\n" + "st1 { v30.4s }, [x19], #0x10\n" + "tbz x14, #1, 195f\n" + "str d11, [x12], #0x8\n" + "str d15, [x23], #0x8\n" + "str d19, [x22], #0x8\n" + "str d23, [x21], #0x8\n" + "str d27, [x20], #0x8\n" + "str d31, [x19], #0x8\n" + "tbz x14, #0, 202f\n" + "st1 { v11.s }[2], [x12]\n" + "st1 { v15.s }[2], [x23]\n" + "st1 { v19.s }[2], [x22]\n" + "st1 { v23.s }[2], [x21]\n" + "st1 { v27.s }[2], [x20]\n" + "st1 { v31.s }[2], [x19]\n" + "b 202f\n" + "195:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x14, #0, 202f\n" + "str s11, [x12, #0x0]\n" + "str s15, [x23, #0x0]\n" + "str s19, [x22, #0x0]\n" + "str s23, [x21, 
#0x0]\n" + "str s27, [x20, #0x0]\n" + "str s31, [x19, #0x0]\n" + "b 202f\n" + "196:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x14, #1, 197f\n" + "str d10, [x12], #0x8\n" + "str d14, [x23], #0x8\n" + "str d18, [x22], #0x8\n" + "str d22, [x21], #0x8\n" + "str d26, [x20], #0x8\n" + "str d30, [x19], #0x8\n" + "tbz x14, #0, 202f\n" + "st1 { v10.s }[2], [x12]\n" + "st1 { v14.s }[2], [x23]\n" + "st1 { v18.s }[2], [x22]\n" + "st1 { v22.s }[2], [x21]\n" + "st1 { v26.s }[2], [x20]\n" + "st1 { v30.s }[2], [x19]\n" + "b 202f\n" + "197:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x14, #0, 202f\n" + "str s10, [x12, #0x0]\n" + "str s14, [x23, #0x0]\n" + "str s18, [x22, #0x0]\n" + "str s22, [x21, #0x0]\n" + "str s26, [x20, #0x0]\n" + "str s30, [x19, #0x0]\n" + "b 202f\n" + "198:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x14, #2, 200f\n" + "st1 { v8.4s }, [x12], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "st1 { v24.4s }, [x20], #0x10\n" + "st1 { v28.4s }, [x19], #0x10\n" + "tbz x14, #1, 199f\n" + "str d9, [x12], #0x8\n" + "str d13, [x23], #0x8\n" + "str d17, [x22], #0x8\n" + "str d21, [x21], #0x8\n" + "str d25, [x20], #0x8\n" + "str d29, [x19], #0x8\n" + "tbz x14, #0, 202f\n" + "st1 { v9.s }[2], [x12]\n" + "st1 { v13.s }[2], [x23]\n" + "st1 { v17.s }[2], [x22]\n" + "st1 { v21.s }[2], [x21]\n" + "st1 { v25.s }[2], [x20]\n" + "st1 { v29.s }[2], [x19]\n" + "b 202f\n" + "199:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x14, #0, 202f\n" + "str s9, [x12, #0x0]\n" + "str s13, [x23, #0x0]\n" + "str s17, [x22, #0x0]\n" + "str s21, [x21, #0x0]\n" + "str s25, [x20, #0x0]\n" + "str s29, [x19, #0x0]\n" + "b 202f\n" + "200:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x14, #1, 201f\n" + "str d8, [x12], #0x8\n" + "str d12, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "str d20, [x21], #0x8\n" + "str d24, [x20], #0x8\n" + "str d28, [x19], #0x8\n" + 
"tbz x14, #0, 202f\n" + "st1 { v8.s }[2], [x12]\n" + "st1 { v12.s }[2], [x23]\n" + "st1 { v16.s }[2], [x22]\n" + "st1 { v20.s }[2], [x21]\n" + "st1 { v24.s }[2], [x20]\n" + "st1 { v28.s }[2], [x19]\n" + "b 202f\n" + "201:" // Height 6: Partial direct writeback: partial_1_0 + "str s8, [x12, #0x0]\n" + "str s12, [x23, #0x0]\n" + "str s16, [x22, #0x0]\n" + "str s20, [x21, #0x0]\n" + "str s24, [x20, #0x0]\n" + "str s28, [x19, #0x0]\n" + "202:" // Height 6: Partial direct writeback: Done + "b 204f\n" + "203:" // Height 6: Full writeback + "str q8, [x12, #0x0]\n" + "str q9, [x12, #0x10]\n" + "str q10, [x12, #0x20]\n" + "str q11, [x12, #0x30]\n" + "add x12, x12, #0x40\n" + "str q12, [x23, #0x0]\n" + "str q13, [x23, #0x10]\n" + "str q14, [x23, #0x20]\n" + "str q15, [x23, #0x30]\n" + "str q16, [x22, #0x0]\n" + "str q17, [x22, #0x10]\n" + "str q18, [x22, #0x20]\n" + "str q19, [x22, #0x30]\n" + "str q20, [x21, #0x0]\n" + "str q21, [x21, #0x10]\n" + "str q22, [x21, #0x20]\n" + "str q23, [x21, #0x30]\n" + "str q24, [x20, #0x0]\n" + "str q25, [x20, #0x10]\n" + "str q26, [x20, #0x20]\n" + "str q27, [x20, #0x30]\n" + "str q28, [x19, #0x0]\n" + "str q29, [x19, #0x10]\n" + "str q30, [x19, #0x20]\n" + "str q31, [x19, #0x30]\n" + "204:" // Height 6: Writeback done + "subs x14, x14, #0x10\n" + "bgt 172b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 206f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 205f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "205:" // Update direct input + "mov x19, #0x6\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "206:" // Exit + + : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) + : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" 
(offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp index 89aac84cc5..ba57ad493a 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp @@ -80,392 +80,378 @@ void a64_hybrid_u8u32_dot_6x16 ( "1:" // Row loop "cmp %x[M], #0x6\n" - "bge 176f\n" + "bge 171f\n" "cmp %x[M], #0x4\n" - "bgt 141f\n" - "beq 106f\n" + "bgt 137f\n" + "beq 103f\n" "cmp %x[M], #0x2\n" - "bgt 71f\n" - "beq 36f\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 2f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "b 3f\n" - "2:" // Height 1: setup direct output - "mov x13, %x[output_ptr]\n" - "3:" // Height 1: Column loop - "tbz %x[flags], #0, 13f\n" - "cmp x15, #0x10\n" - "bge 12f\n" - "tbz x15, #3, 7f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v9.4s }, [x13], #0x10\n" - "tbz x15, #2, 5f\n" - "ld1 { v10.4s }, [x13], #0x10\n" - "tbz x15, #1, 4f\n" - "mov x19, #0x38\n" - "ldr d11, [x13], #0x8\n" - "tbz x15, #0, 11f\n" - "ld1 { v11.s }[2], [x13]\n" - "b 11f\n" - "4:" // Height 1: Partial 
accumulate: partial_1_12 - "mov x19, #0x30\n" - "tbz x15, #0, 11f\n" - "ldr s11, [x13, #0x0]\n" - "b 11f\n" - "5:" // Height 1: Partial accumulate: partial_2_8 - "tbz x15, #1, 6f\n" - "ldr d10, [x13], #0x8\n" - "mov x19, #0x28\n" - "tbz x15, #0, 11f\n" - "ld1 { v10.s }[2], [x13]\n" - "b 11f\n" - "6:" // Height 1: Partial accumulate: partial_1_8 - "mov x19, #0x20\n" - "tbz x15, #0, 11f\n" - "ldr s10, [x13, #0x0]\n" - "b 11f\n" - "7:" // Height 1: Partial accumulate: partial_4_0 - "tbz x15, #2, 9f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "tbz x15, #1, 8f\n" - "mov x19, #0x18\n" - "ldr d9, [x13], #0x8\n" - "tbz x15, #0, 11f\n" - "ld1 { v9.s }[2], [x13]\n" - "b 11f\n" - "8:" // Height 1: Partial accumulate: partial_1_4 - "mov x19, #0x10\n" - "tbz x15, #0, 11f\n" - "ldr s9, [x13, #0x0]\n" - "b 11f\n" - "9:" // Height 1: Partial accumulate: partial_2_0 - "tbz x15, #1, 10f\n" - "ldr d8, [x13], #0x8\n" - "mov x19, #0x8\n" - "tbz x15, #0, 11f\n" - "ld1 { v8.s }[2], [x13]\n" - "b 11f\n" - "10:" // Height 1: Partial accumulate: partial_1_0 - "mov x19, #0x0\n" - "ldr s8, [x13, #0x0]\n" - "11:" // Height 1: Partial accumulate: Done - "sub x13, x13, x19\n" - "b 14f\n" - "12:" // Height 1: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "b 14f\n" - "13:" // Height 1: no accumulate + "bgt 69f\n" + "beq 35f\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "2:" // Height 1: Column loop + "tbz %x[flags], #0, 12f\n" + "cmp x10, #0x10\n" + "bge 11f\n" + "tbz x10, #3, 6f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "tbz x10, #2, 4f\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "tbz x10, #1, 3f\n" + "mov x24, #0x38\n" + "ldr d11, [x28], #0x8\n" + "tbz x10, #0, 10f\n" + "ld1 { v11.s }[2], [x28]\n" + "b 10f\n" + "3:" // Height 1: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x10, #0, 10f\n" + "ldr 
s11, [x28, #0x0]\n" + "b 10f\n" + "4:" // Height 1: Partial accumulate: partial_2_8 + "tbz x10, #1, 5f\n" + "ldr d10, [x28], #0x8\n" + "mov x24, #0x28\n" + "tbz x10, #0, 10f\n" + "ld1 { v10.s }[2], [x28]\n" + "b 10f\n" + "5:" // Height 1: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x10, #0, 10f\n" + "ldr s10, [x28, #0x0]\n" + "b 10f\n" + "6:" // Height 1: Partial accumulate: partial_4_0 + "tbz x10, #2, 8f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "tbz x10, #1, 7f\n" + "ldr d9, [x28], #0x8\n" + "mov x24, #0x18\n" + "tbz x10, #0, 10f\n" + "ld1 { v9.s }[2], [x28]\n" + "b 10f\n" + "7:" // Height 1: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x10, #0, 10f\n" + "ldr s9, [x28, #0x0]\n" + "b 10f\n" + "8:" // Height 1: Partial accumulate: partial_2_0 + "tbz x10, #1, 9f\n" + "ldr d8, [x28], #0x8\n" + "mov x24, #0x8\n" + "tbz x10, #0, 10f\n" + "ld1 { v8.s }[2], [x28]\n" + "b 10f\n" + "9:" // Height 1: Partial accumulate: partial_1_0 + "ldr s8, [x28, #0x0]\n" + "mov x24, #0x0\n" + "10:" // Height 1: Partial accumulate: Done + "sub x28, x28, x24\n" + "b 13f\n" + "11:" // Height 1: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "b 13f\n" + "12:" // Height 1: no accumulate "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" "movi v11.4s, #0x0\n" - "14:" // Height 1: setup done - "mov x12, #0x0\n" - "15:" // Height 1: String loop + "13:" // Height 1: setup done + "mov x27, #0x0\n" + "14:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 16f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 15f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "cbnz x12, 17f\n" + "ldr x25, [x20, #0x0]\n" + "cbnz x27, 16f\n" 
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "b 17f\n" - "16:" // Height 1: setup direct input - "mov x10, %x[input_ptr]\n" - "17:" // Height 1: input setup done - "cmp x11, #0x10\n" - "blt 20f\n" - "cmp x11, #0x20\n" + "add x25, x25, x19\n" + "b 16f\n" + "15:" // Height 1: setup direct input + "mov x25, %x[input_ptr]\n" + "16:" // Height 1: input setup done + "cmp x26, #0x10\n" "blt 19f\n" - "18:" // Height 1: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "ldr q0, [x25, #0x0]\n" + "ldr q6, [x9, #0x0]\n" + "cmp x26, #0x20\n" + "blt 18f\n" + "17:" // Height 1: Multiply loop: Main loop head ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q7, [x9, #0x10]\n" + "add x25, x25, #0x10\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q6, [x9, #0x20]\n" + "sub x26, x26, #0x10\n" ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x14, #0x40]\n" - "add x10, x10, #0x10\n" + "ldr q7, [x9, #0x30]\n" + "cmp x26, #0x20\n" ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q6, [x9, #0x40]\n" + "ldr q7, [x9, #0x50]\n" ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x14, #0x60]\n" - "sub x11, x11, #0x10\n" + "ldr q6, [x9, #0x60]\n" ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x14, #0x70]\n" - "cmp x11, #0x20\n" + "ldr q7, [x9, #0x70]\n" + "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x9, #0xa0]\n" ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x9, 
#0xb0]\n" ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x9, #0xc0]\n" ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x9, #0xd0]\n" ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x9, #0xe0]\n" ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" - "add x14, x14, #0x100\n" + "ldr q7, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + "ldr q6, [x9, #0x0]\n" ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" - "bge 18b\n" - "19:" // Height 1: Multiply loop: Single iteration only - "sub x11, x11, #0x10\n" - "ldr q0, [x10, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "ldr q0, [x25, #0x0]\n" + "bge 17b\n" + "18:" // Height 1: Multiply loop: Single iteration only ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q7, [x9, #0x10]\n" + "sub x26, x26, #0x10\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q6, [x9, #0x20]\n" + "add x25, x25, #0x10\n" ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x14, #0x40]\n" - "add x10, x10, #0x10\n" + "ldr q7, [x9, #0x30]\n" + "ldr q6, [x9, #0x40]\n" ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x9, #0x50]\n" ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x9, #0x60]\n" + "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x9, #0x70]\n" ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x14, 
#0xa0]\n" + "ldr q6, [x9, #0xa0]\n" ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x9, #0xb0]\n" ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x9, #0xc0]\n" ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x9, #0xd0]\n" ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x9, #0xe0]\n" ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" - "add x14, x14, #0x100\n" + "ldr q7, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" - "20:" // Height 1: Multiply loop: Main loop skip - "cbz x11, 25f\n" - "cmp x11, #0x4\n" - "blt 22f\n" - "21:" // Height 1: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr q6, [x14, #0x0]\n" + "19:" // Height 1: Multiply loop: Main loop skip + "cbz x26, 24f\n" + "cmp x26, #0x4\n" + "blt 21f\n" + "20:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x4\n" + "ldr q6, [x9, #0x0]\n" ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q7, [x9, #0x10]\n" + "cmp x26, #0x4\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q6, [x9, #0x20]\n" + "ldr q7, [x9, #0x30]\n" ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - "sub x11, x11, #0x4\n" - "add x14, x14, #0x40\n" + "add x9, x9, #0x40\n" ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - "cmp x11, #0x4\n" - "bge 21b\n" - "cbz x11, 25f\n" - "22:" // Height 1: Multiply loop: Skip odd blocks - "tbz x11, #1, 23f\n" - "ldr h0, [x10], #0x2\n" - "tbz x11, #0, 24f\n" - "ld1 { v0.b }[2], [x10]\n" - "b 24f\n" - "23:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x10, #0x0]\n" - "24:" // Height 1: Multiply loop: Ragged operand 
read: Done - "ldr q6, [x14, #0x0]\n" + "bge 20b\n" + "cbz x26, 24f\n" + "21:" // Height 1: Multiply loop: Skip odd blocks + "tbz x26, #1, 22f\n" + "ldr h0, [x25], #0x2\n" + "tbz x26, #0, 23f\n" + "ld1 { v0.b }[2], [x25]\n" + "b 23f\n" + "22:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x25, #0x0]\n" + "23:" // Height 1: Multiply loop: Ragged operand read: Done + "ldr q6, [x9, #0x0]\n" ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q7, [x9, #0x10]\n" + "ldr q6, [x9, #0x20]\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" + "add x9, x9, #0x40\n" ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - "25:" // Height 1: Multiply loop: No odd multiplies + "24:" // Height 1: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 15b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "cmp x15, #0x10\n" - "bge 34f\n" - "tbz x15, #3, 29f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v9.4s }, [x13], #0x10\n" - "tbz x15, #2, 27f\n" - "st1 { v10.4s }, [x13], #0x10\n" - "tbz x15, #1, 26f\n" - "str d11, [x13], #0x8\n" - "tbz x15, #0, 33f\n" - "st1 { v11.s }[2], [x13]\n" - "b 33f\n" - "26:" // Height 1: Partial direct writeback: partial_1_12 - "tbz x15, #0, 33f\n" - "str s11, [x13, #0x0]\n" - "b 33f\n" - "27:" // Height 1: Partial direct writeback: partial_2_8 - "tbz x15, #1, 28f\n" - "str d10, [x13], #0x8\n" - "tbz x15, #0, 33f\n" - "st1 { v10.s }[2], [x13]\n" - "b 33f\n" - "28:" // Height 1: Partial direct writeback: partial_1_8 - "tbz x15, #0, 33f\n" - "str s10, [x13, #0x0]\n" - "b 33f\n" - "29:" // Height 1: Partial direct writeback: partial_4_0 - "tbz x15, #2, 31f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "tbz x15, #1, 30f\n" - "str d9, [x13], #0x8\n" - "tbz x15, #0, 33f\n" - 
"st1 { v9.s }[2], [x13]\n" - "b 33f\n" - "30:" // Height 1: Partial direct writeback: partial_1_4 - "tbz x15, #0, 33f\n" - "str s9, [x13, #0x0]\n" - "b 33f\n" - "31:" // Height 1: Partial direct writeback: partial_2_0 - "tbz x15, #1, 32f\n" - "str d8, [x13], #0x8\n" - "tbz x15, #0, 33f\n" - "st1 { v8.s }[2], [x13]\n" - "b 33f\n" - "32:" // Height 1: Partial direct writeback: partial_1_0 - "str s8, [x13, #0x0]\n" - "33:" // Height 1: Partial direct writeback: Done - "b 35f\n" - "34:" // Height 1: Full writeback - "str q8, [x13, #0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "add x13, x13, #0x40\n" - "35:" // Height 1: Writeback done - "subs x15, x15, #0x10\n" - "bgt 3b\n" - "b 212f\n" - "36:" // Height 2 - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 14b\n" + "prfm pstl1keep, [x28, #0x0]\n" + "cmp x10, #0x10\n" + "bge 33f\n" + "tbz x10, #3, 28f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v9.4s }, [x28], #0x10\n" + "tbz x10, #2, 26f\n" + "st1 { v10.4s }, [x28], #0x10\n" + "tbz x10, #1, 25f\n" + "str d11, [x28], #0x8\n" + "tbz x10, #0, 32f\n" + "st1 { v11.s }[2], [x28]\n" + "b 32f\n" + "25:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x10, #0, 32f\n" + "str s11, [x28, #0x0]\n" + "b 32f\n" + "26:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x10, #1, 27f\n" + "str d10, [x28], #0x8\n" + "tbz x10, #0, 32f\n" + "st1 { v10.s }[2], [x28]\n" + "b 32f\n" + "27:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x10, #0, 32f\n" + "str s10, [x28, #0x0]\n" + "b 32f\n" + "28:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x10, #2, 30f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "tbz x10, #1, 29f\n" + "str d9, [x28], #0x8\n" + "tbz x10, #0, 32f\n" + "st1 { v9.s }[2], [x28]\n" + "b 32f\n" + "29:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x10, #0, 32f\n" + "str s9, [x28, #0x0]\n" 
+ "b 32f\n" + "30:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x10, #1, 31f\n" + "str d8, [x28], #0x8\n" + "tbz x10, #0, 32f\n" + "st1 { v8.s }[2], [x28]\n" + "b 32f\n" + "31:" // Height 1: Partial direct writeback: partial_1_0 + "str s8, [x28, #0x0]\n" + "32:" // Height 1: Partial direct writeback: Done + "b 34f\n" + "33:" // Height 1: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "34:" // Height 1: Writeback done + "subs x10, x10, #0x10\n" + "bgt 2b\n" + "b 206f\n" + "35:" // Height 2 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "36:" // Height 2: Column loop + "tbz %x[flags], #0, 46f\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 37f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "add x9, x9, x19, LSL #2\n" - "b 38f\n" - "37:" // Height 2: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "38:" // Height 2: Column loop - "tbz %x[flags], #0, 48f\n" - "cmp x15, #0x10\n" - "bge 47f\n" - "tbz x15, #3, 42f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v9.4s }, [x13], #0x10\n" - "ld1 { v13.4s }, [x9], #0x10\n" - "tbz x15, #2, 40f\n" - "ld1 { v10.4s }, [x13], #0x10\n" - "ld1 { v14.4s }, [x9], #0x10\n" - "tbz x15, #1, 39f\n" - "mov x19, #0x38\n" - "ldr d11, [x13], #0x8\n" - "ldr d15, [x9], #0x8\n" - "tbz x15, #0, 46f\n" - "ld1 { v11.s }[2], [x13]\n" - "ld1 { v15.s }[2], [x9]\n" - "b 46f\n" - "39:" // Height 2: Partial accumulate: partial_1_12 - "mov x19, #0x30\n" - "tbz x15, #0, 46f\n" - "ldr s11, [x13, #0x0]\n" - "ldr s15, [x9, #0x0]\n" - "b 46f\n" - "40:" // Height 2: Partial accumulate: partial_2_8 - "tbz x15, #1, 41f\n" - "ldr d10, [x13], #0x8\n" - "ldr d14, [x9], #0x8\n" - "mov x19, #0x28\n" - "tbz x15, 
#0, 46f\n" - "ld1 { v10.s }[2], [x13]\n" - "ld1 { v14.s }[2], [x9]\n" - "b 46f\n" - "41:" // Height 2: Partial accumulate: partial_1_8 - "mov x19, #0x20\n" - "tbz x15, #0, 46f\n" - "ldr s10, [x13, #0x0]\n" - "ldr s14, [x9, #0x0]\n" - "b 46f\n" - "42:" // Height 2: Partial accumulate: partial_4_0 - "tbz x15, #2, 44f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "tbz x15, #1, 43f\n" - "mov x19, #0x18\n" - "ldr d9, [x13], #0x8\n" - "ldr d13, [x9], #0x8\n" - "tbz x15, #0, 46f\n" - "ld1 { v9.s }[2], [x13]\n" - "ld1 { v13.s }[2], [x9]\n" - "b 46f\n" - "43:" // Height 2: Partial accumulate: partial_1_4 - "mov x19, #0x10\n" - "tbz x15, #0, 46f\n" - "ldr s9, [x13, #0x0]\n" - "ldr s13, [x9, #0x0]\n" - "b 46f\n" - "44:" // Height 2: Partial accumulate: partial_2_0 - "tbz x15, #1, 45f\n" - "ldr d8, [x13], #0x8\n" - "ldr d12, [x9], #0x8\n" - "mov x19, #0x8\n" - "tbz x15, #0, 46f\n" - "ld1 { v8.s }[2], [x13]\n" - "ld1 { v12.s }[2], [x9]\n" - "b 46f\n" - "45:" // Height 2: Partial accumulate: partial_1_0 - "mov x19, #0x0\n" - "ldr s8, [x13, #0x0]\n" - "ldr s12, [x9, #0x0]\n" - "46:" // Height 2: Partial accumulate: Done - "sub x13, x13, x19\n" - "sub x9, x9, x19\n" - "b 49f\n" - "47:" // Height 2: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "ldr q12, [x9, #0x0]\n" - "ldr q13, [x9, #0x10]\n" - "ldr q14, [x9, #0x20]\n" - "ldr q15, [x9, #0x30]\n" - "b 49f\n" - "48:" // Height 2: no accumulate + "cmp x10, #0x10\n" + "add x23, x28, x19, LSL #2\n" + "bge 45f\n" + "tbz x10, #3, 40f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "tbz x10, #2, 38f\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "tbz x10, #1, 37f\n" + "mov x24, #0x38\n" + "ldr d11, [x28], #0x8\n" + "ldr d15, [x23], #0x8\n" + "tbz x10, #0, 44f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v15.s 
}[2], [x23]\n" + "b 44f\n" + "37:" // Height 2: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x10, #0, 44f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "b 44f\n" + "38:" // Height 2: Partial accumulate: partial_2_8 + "tbz x10, #1, 39f\n" + "ldr d10, [x28], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "tbz x10, #0, 44f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x23]\n" + "b 44f\n" + "39:" // Height 2: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x10, #0, 44f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "b 44f\n" + "40:" // Height 2: Partial accumulate: partial_4_0 + "tbz x10, #2, 42f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "tbz x10, #1, 41f\n" + "mov x24, #0x18\n" + "ldr d9, [x28], #0x8\n" + "ldr d13, [x23], #0x8\n" + "tbz x10, #0, 44f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x23]\n" + "b 44f\n" + "41:" // Height 2: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x10, #0, 44f\n" + "ldr s9, [x28, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "b 44f\n" + "42:" // Height 2: Partial accumulate: partial_2_0 + "tbz x10, #1, 43f\n" + "ldr d8, [x28], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "tbz x10, #0, 44f\n" + "ld1 { v8.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x23]\n" + "b 44f\n" + "43:" // Height 2: Partial accumulate: partial_1_0 + "ldr s8, [x28, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "44:" // Height 2: Partial accumulate: Done + "sub x28, x28, x24\n" + "b 47f\n" + "45:" // Height 2: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + "b 47f\n" + "46:" // Height 2: no accumulate "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -474,405 +460,395 @@ void a64_hybrid_u8u32_dot_6x16 ( "movi v13.4s, #0x0\n" 
"movi v14.4s, #0x0\n" "movi v15.4s, #0x0\n" - "49:" // Height 2: setup done - "mov x12, #0x0\n" - "50:" // Height 2: String loop + "47:" // Height 2: setup done + "mov x27, #0x0\n" + "48:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 51f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 49f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "cbnz x12, 52f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x27, 50f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "b 52f\n" - "51:" // Height 2: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "52:" // Height 2: input setup done - "cmp x11, #0x10\n" - "blt 55f\n" - "cmp x11, #0x20\n" - "blt 54f\n" - "53:" // Height 2: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "b 50f\n" + "49:" // Height 2: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "50:" // Height 2: input setup done + "cmp x26, #0x10\n" + "blt 53f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x20\n" + "ldr q6, [x9, #0x0]\n" + "blt 52f\n" + "51:" // Height 2: Multiply loop: Main loop head ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" + "add x25, x25, #0x10\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x14, #0x20]\n" - "add x10, x10, #0x10\n" + "ldr q6, [x9, #0x20]\n" + "add x24, x24, #0x10\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x25, 
#0x80]\n" + "sub x26, x26, #0x10\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" + "cmp x26, #0x20\n" ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - "sub x11, x11, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x14, #0x40]\n" - "cmp x11, #0x20\n" + "ldr q6, [x9, #0x40]\n" ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x9, #0x50]\n" ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x9, #0x60]\n" ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x9, #0x70]\n" ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x9, #0xa0]\n" ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x9, #0xb0]\n" ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x9, #0xc0]\n" ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x9, #0xd0]\n" ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" - 
"ldr q6, [x14, #0xe0]\n" + "ldr q6, [x9, #0xe0]\n" ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" - "add x14, x14, #0x100\n" + "ldr q7, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + "ldr q6, [x9, #0x0]\n" ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + "ldr q0, [x25, #0x0]\n" ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" - "bge 53b\n" - "54:" // Height 2: Multiply loop: Single iteration only - "sub x11, x11, #0x10\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "bge 51b\n" + "52:" // Height 2: Multiply loop: Single iteration only ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" + "sub x26, x26, #0x10\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x14, #0x20]\n" - "add x10, x10, #0x10\n" + "ldr q6, [x9, #0x20]\n" + "add x25, x25, #0x10\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x14, #0x40]\n" + "ldr q6, [x9, #0x40]\n" ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x9, #0x50]\n" ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x9, #0x60]\n" ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" ".inst 0x6fa1e0ed 
// udot v13.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x9, #0x70]\n" ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x9, #0xa0]\n" ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x9, #0xb0]\n" ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x9, #0xc0]\n" ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x9, #0xd0]\n" ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x9, #0xe0]\n" ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" - "add x14, x14, #0x100\n" + "ldr q7, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" - "55:" // Height 2: Multiply loop: Main loop skip - "cbz x11, 60f\n" - "cmp x11, #0x4\n" - "blt 57f\n" - "56:" // Height 2: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x28], #0x4\n" - "ldr q6, [x14, #0x0]\n" + "53:" // Height 2: Multiply loop: Main loop skip + "cbz x26, 58f\n" + "cmp x26, #0x4\n" + "blt 55f\n" + "54:" // Height 2: 
Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x4\n" + "ldr s1, [x24], #0x4\n" + "cmp x26, #0x4\n" + "ldr q6, [x9, #0x0]\n" ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x14, #0x20]\n" - "sub x11, x11, #0x4\n" + "ldr q6, [x9, #0x20]\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "cmp x11, #0x4\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" + "add x9, x9, #0x40\n" ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - "bge 56b\n" - "cbz x11, 60f\n" - "57:" // Height 2: Multiply loop: Skip odd blocks - "tbz x11, #1, 58f\n" - "ldr h0, [x10], #0x2\n" - "ldr h1, [x28], #0x2\n" - "tbz x11, #0, 59f\n" - "ld1 { v0.b }[2], [x10]\n" - "ld1 { v1.b }[2], [x28]\n" - "b 59f\n" - "58:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x10, #0x0]\n" - "ldr b1, [x28, #0x0]\n" - "59:" // Height 2: Multiply loop: Ragged operand read: Done - "ldr q6, [x14, #0x0]\n" + "bge 54b\n" + "cbz x26, 58f\n" + "55:" // Height 2: Multiply loop: Skip odd blocks + "tbz x26, #1, 56f\n" + "ldr h0, [x25], #0x2\n" + "ldr h1, [x24], #0x2\n" + "tbz x26, #0, 57f\n" + "ld1 { v0.b }[2], [x25]\n" + "ld1 { v1.b }[2], [x24]\n" + "b 57f\n" + "56:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x25, #0x0]\n" + "ldr b1, [x24, #0x0]\n" + "57:" // Height 2: Multiply loop: Ragged operand read: Done + "ldr q6, [x9, #0x0]\n" ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x9, #0x20]\n" ".inst 0x6f80e0e9 // udot 
v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" + "add x9, x9, #0x40\n" ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - "60:" // Height 2: Multiply loop: No odd multiplies + "58:" // Height 2: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 50b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "cmp x15, #0x10\n" - "bge 69f\n" - "tbz x15, #3, 64f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v9.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v13.4s }, [x9], #0x10\n" - "tbz x15, #2, 62f\n" - "st1 { v10.4s }, [x13], #0x10\n" - "st1 { v14.4s }, [x9], #0x10\n" - "tbz x15, #1, 61f\n" - "str d11, [x13], #0x8\n" - "str d15, [x9], #0x8\n" - "tbz x15, #0, 68f\n" - "st1 { v11.s }[2], [x13]\n" - "st1 { v15.s }[2], [x9]\n" - "b 68f\n" - "61:" // Height 2: Partial direct writeback: partial_1_12 - "tbz x15, #0, 68f\n" - "str s11, [x13, #0x0]\n" - "str s15, [x9, #0x0]\n" - "b 68f\n" - "62:" // Height 2: Partial direct writeback: partial_2_8 - "tbz x15, #1, 63f\n" - "str d10, [x13], #0x8\n" - "str d14, [x9], #0x8\n" - "tbz x15, #0, 68f\n" - "st1 { v10.s }[2], [x13]\n" - "st1 { v14.s }[2], [x9]\n" - "b 68f\n" - "63:" // Height 2: Partial direct writeback: partial_1_8 - "tbz x15, #0, 68f\n" - "str s10, [x13, #0x0]\n" - "str s14, [x9, #0x0]\n" - "b 68f\n" - "64:" // Height 2: Partial direct writeback: partial_4_0 - "tbz x15, #2, 66f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "tbz x15, #1, 65f\n" - "str d9, [x13], #0x8\n" - "str d13, [x9], #0x8\n" - "tbz x15, #0, 68f\n" - "st1 { v9.s }[2], [x13]\n" - "st1 { v13.s }[2], [x9]\n" - "b 68f\n" - "65:" // 
Height 2: Partial direct writeback: partial_1_4 - "tbz x15, #0, 68f\n" - "str s9, [x13, #0x0]\n" - "str s13, [x9, #0x0]\n" - "b 68f\n" - "66:" // Height 2: Partial direct writeback: partial_2_0 - "tbz x15, #1, 67f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "tbz x15, #0, 68f\n" - "st1 { v8.s }[2], [x13]\n" - "st1 { v12.s }[2], [x9]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 48b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "cmp x10, #0x10\n" + "add x23, x28, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "bge 67f\n" + "tbz x10, #3, 62f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v9.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v13.4s }, [x23], #0x10\n" + "tbz x10, #2, 60f\n" + "st1 { v10.4s }, [x28], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "tbz x10, #1, 59f\n" + "str d11, [x28], #0x8\n" + "str d15, [x23], #0x8\n" + "tbz x10, #0, 66f\n" + "st1 { v11.s }[2], [x28]\n" + "st1 { v15.s }[2], [x23]\n" + "b 66f\n" + "59:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x10, #0, 66f\n" + "str s11, [x28, #0x0]\n" + "str s15, [x23, #0x0]\n" + "b 66f\n" + "60:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x10, #1, 61f\n" + "str d10, [x28], #0x8\n" + "str d14, [x23], #0x8\n" + "tbz x10, #0, 66f\n" + "st1 { v10.s }[2], [x28]\n" + "st1 { v14.s }[2], [x23]\n" + "b 66f\n" + "61:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x10, #0, 66f\n" + "str s10, [x28, #0x0]\n" + "str s14, [x23, #0x0]\n" + "b 66f\n" + "62:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x10, #2, 64f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "tbz x10, #1, 63f\n" + "str d9, [x28], #0x8\n" + "str d13, [x23], #0x8\n" + "tbz x10, #0, 66f\n" + "st1 { v9.s }[2], [x28]\n" + "st1 { v13.s }[2], [x23]\n" + "b 66f\n" + "63:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x10, #0, 66f\n" + "str s9, [x28, #0x0]\n" + "str s13, 
[x23, #0x0]\n" + "b 66f\n" + "64:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x10, #1, 65f\n" + "str d8, [x28], #0x8\n" + "str d12, [x23], #0x8\n" + "tbz x10, #0, 66f\n" + "st1 { v8.s }[2], [x28]\n" + "st1 { v12.s }[2], [x23]\n" + "b 66f\n" + "65:" // Height 2: Partial direct writeback: partial_1_0 + "str s8, [x28, #0x0]\n" + "str s12, [x23, #0x0]\n" + "66:" // Height 2: Partial direct writeback: Done "b 68f\n" - "67:" // Height 2: Partial direct writeback: partial_1_0 - "str s8, [x13, #0x0]\n" - "str s12, [x9, #0x0]\n" - "68:" // Height 2: Partial direct writeback: Done - "b 70f\n" - "69:" // Height 2: Full writeback - "str q8, [x13, #0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "str q12, [x9, #0x0]\n" - "str q13, [x9, #0x10]\n" - "str q14, [x9, #0x20]\n" - "str q15, [x9, #0x30]\n" - "add x13, x13, #0x40\n" - "add x9, x9, #0x40\n" - "70:" // Height 2: Writeback done - "subs x15, x15, #0x10\n" - "bgt 38b\n" - "b 212f\n" - "71:" // Height 3 - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "67:" // Height 2: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q12, [x23, #0x0]\n" + "str q13, [x23, #0x10]\n" + "str q14, [x23, #0x20]\n" + "str q15, [x23, #0x30]\n" + "68:" // Height 2: Writeback done + "subs x10, x10, #0x10\n" + "bgt 36b\n" + "b 206f\n" + "69:" // Height 3 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "70:" // Height 3: Column loop + "tbz %x[flags], #0, 80f\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 72f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "add x27, x27, x19, LSL #2\n" - "b 73f\n" - "72:" 
// Height 3: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "73:" // Height 3: Column loop - "tbz %x[flags], #0, 83f\n" - "cmp x15, #0x10\n" - "bge 82f\n" - "tbz x15, #3, 77f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "ld1 { v9.4s }, [x13], #0x10\n" - "ld1 { v13.4s }, [x9], #0x10\n" - "ld1 { v17.4s }, [x27], #0x10\n" - "tbz x15, #2, 75f\n" - "ld1 { v10.4s }, [x13], #0x10\n" - "ld1 { v14.4s }, [x9], #0x10\n" - "ld1 { v18.4s }, [x27], #0x10\n" - "tbz x15, #1, 74f\n" - "mov x19, #0x38\n" - "ldr d11, [x13], #0x8\n" - "ldr d15, [x9], #0x8\n" - "ldr d19, [x27], #0x8\n" - "tbz x15, #0, 81f\n" - "ld1 { v11.s }[2], [x13]\n" - "ld1 { v15.s }[2], [x9]\n" - "ld1 { v19.s }[2], [x27]\n" - "b 81f\n" - "74:" // Height 3: Partial accumulate: partial_1_12 - "mov x19, #0x30\n" - "tbz x15, #0, 81f\n" - "ldr s11, [x13, #0x0]\n" - "ldr s15, [x9, #0x0]\n" - "ldr s19, [x27, #0x0]\n" - "b 81f\n" - "75:" // Height 3: Partial accumulate: partial_2_8 - "tbz x15, #1, 76f\n" - "ldr d10, [x13], #0x8\n" - "ldr d14, [x9], #0x8\n" - "ldr d18, [x27], #0x8\n" - "mov x19, #0x28\n" - "tbz x15, #0, 81f\n" - "ld1 { v10.s }[2], [x13]\n" - "ld1 { v14.s }[2], [x9]\n" - "ld1 { v18.s }[2], [x27]\n" + "cmp x10, #0x10\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "bge 79f\n" + "tbz x10, #3, 74f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "tbz x10, #2, 72f\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { v18.4s }, [x22], #0x10\n" + "tbz x10, #1, 71f\n" + "mov x24, #0x38\n" + "ldr d11, [x28], #0x8\n" + "ldr d15, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "tbz x10, #0, 78f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x23]\n" + "ld1 { v19.s }[2], 
[x22]\n" + "b 78f\n" + "71:" // Height 3: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x10, #0, 78f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "b 78f\n" + "72:" // Height 3: Partial accumulate: partial_2_8 + "tbz x10, #1, 73f\n" + "ldr d10, [x28], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "ldr d18, [x22], #0x8\n" + "tbz x10, #0, 78f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x23]\n" + "ld1 { v18.s }[2], [x22]\n" + "b 78f\n" + "73:" // Height 3: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x10, #0, 78f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "b 78f\n" + "74:" // Height 3: Partial accumulate: partial_4_0 + "tbz x10, #2, 76f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "tbz x10, #1, 75f\n" + "mov x24, #0x18\n" + "ldr d9, [x28], #0x8\n" + "ldr d13, [x23], #0x8\n" + "ldr d17, [x22], #0x8\n" + "tbz x10, #0, 78f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x23]\n" + "ld1 { v17.s }[2], [x22]\n" + "b 78f\n" + "75:" // Height 3: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x10, #0, 78f\n" + "ldr s9, [x28, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "b 78f\n" + "76:" // Height 3: Partial accumulate: partial_2_0 + "tbz x10, #1, 77f\n" + "ldr d8, [x28], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "ldr d16, [x22], #0x8\n" + "tbz x10, #0, 78f\n" + "ld1 { v8.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x23]\n" + "ld1 { v16.s }[2], [x22]\n" + "b 78f\n" + "77:" // Height 3: Partial accumulate: partial_1_0 + "ldr s8, [x28, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "ldr s16, [x22, #0x0]\n" + "78:" // Height 3: Partial accumulate: Done + "sub x28, x28, x24\n" "b 81f\n" - "76:" // Height 3: Partial accumulate: partial_1_8 - "mov x19, #0x20\n" - "tbz x15, #0, 81f\n" - "ldr s10, [x13, #0x0]\n" - "ldr s14, [x9, 
#0x0]\n" - "ldr s18, [x27, #0x0]\n" + "79:" // Height 3: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + "ldr q16, [x22, #0x0]\n" + "ldr q17, [x22, #0x10]\n" + "ldr q18, [x22, #0x20]\n" + "ldr q19, [x22, #0x30]\n" "b 81f\n" - "77:" // Height 3: Partial accumulate: partial_4_0 - "tbz x15, #2, 79f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "tbz x15, #1, 78f\n" - "mov x19, #0x18\n" - "ldr d9, [x13], #0x8\n" - "ldr d13, [x9], #0x8\n" - "ldr d17, [x27], #0x8\n" - "tbz x15, #0, 81f\n" - "ld1 { v9.s }[2], [x13]\n" - "ld1 { v13.s }[2], [x9]\n" - "ld1 { v17.s }[2], [x27]\n" - "b 81f\n" - "78:" // Height 3: Partial accumulate: partial_1_4 - "mov x19, #0x10\n" - "tbz x15, #0, 81f\n" - "ldr s9, [x13, #0x0]\n" - "ldr s13, [x9, #0x0]\n" - "ldr s17, [x27, #0x0]\n" - "b 81f\n" - "79:" // Height 3: Partial accumulate: partial_2_0 - "tbz x15, #1, 80f\n" - "ldr d8, [x13], #0x8\n" - "ldr d12, [x9], #0x8\n" - "ldr d16, [x27], #0x8\n" - "mov x19, #0x8\n" - "tbz x15, #0, 81f\n" - "ld1 { v8.s }[2], [x13]\n" - "ld1 { v12.s }[2], [x9]\n" - "ld1 { v16.s }[2], [x27]\n" - "b 81f\n" - "80:" // Height 3: Partial accumulate: partial_1_0 - "mov x19, #0x0\n" - "ldr s8, [x13, #0x0]\n" - "ldr s12, [x9, #0x0]\n" - "ldr s16, [x27, #0x0]\n" - "81:" // Height 3: Partial accumulate: Done - "sub x13, x13, x19\n" - "sub x9, x9, x19\n" - "sub x27, x27, x19\n" - "b 84f\n" - "82:" // Height 3: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "ldr q12, [x9, #0x0]\n" - "ldr q13, [x9, #0x10]\n" - "ldr q14, [x9, #0x20]\n" - "ldr q15, [x9, #0x30]\n" - "ldr q16, [x27, #0x0]\n" - "ldr q17, [x27, #0x10]\n" - "ldr q18, [x27, #0x20]\n" - "ldr q19, [x27, #0x30]\n" - "b 84f\n" - "83:" // Height 
3: no accumulate + "80:" // Height 3: no accumulate "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -885,504 +861,491 @@ void a64_hybrid_u8u32_dot_6x16 ( "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" "movi v19.4s, #0x0\n" - "84:" // Height 3: setup done - "mov x12, #0x0\n" - "85:" // Height 3: String loop + "81:" // Height 3: setup done + "mov x27, #0x0\n" + "82:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 86f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 83f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "cbnz x12, 87f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "cbnz x27, 84f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "add x26, x26, x19\n" - "b 87f\n" - "86:" // Height 3: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "add x26, x28, x19\n" - "87:" // Height 3: input setup done - "cmp x11, #0x10\n" - "blt 90f\n" - "cmp x11, #0x20\n" - "blt 89f\n" - "88:" // Height 3: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "b 84f\n" + "83:" // Height 3: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "84:" // Height 3: input setup done + "cmp x26, #0x10\n" + "blt 87f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x20\n" + "ldr q2, [x23, #0x0]\n" + "ldr q6, [x9, #0x0]\n" + "blt 86f\n" + "85:" // Height 3: Multiply loop: Main loop head ".inst 0x6f80e0c8 // udot v8.4s, 
v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" + "add x25, x25, #0x10\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x9, #0x20]\n" + "add x23, x23, #0x10\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "add x28, x28, #0x10\n" - "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sub x26, x26, #0x10\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x23, #0x80]\n" + "cmp x26, #0x20\n" ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - "sub x11, x11, #0x10\n" ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - "cmp x11, #0x20\n" ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x14, #0x40]\n" + "ldr q6, [x9, #0x40]\n" ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x9, #0x50]\n" ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x9, #0x60]\n" ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x9, #0x70]\n" ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x6fa0e0eb 
// udot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x9, #0xa0]\n" ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x9, #0xb0]\n" ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x9, #0xc0]\n" ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x9, #0xd0]\n" ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x9, #0xe0]\n" ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" - "add x14, x14, #0x100\n" + "ldr q7, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" + "ldr q6, [x9, #0x0]\n" ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + "ldr q0, [x25, #0x0]\n" ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + "ldr q1, [x24, #0x0]\n" ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" - "bge 88b\n" - "89:" // Height 3: Multiply loop: Single 
iteration only - "sub x11, x11, #0x10\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "ldr q2, [x23, #0x0]\n" + "bge 85b\n" + "86:" // Height 3: Multiply loop: Single iteration only ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" + "sub x26, x26, #0x10\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "add x25, x25, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "add x24, x24, #0x10\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "add x28, x28, #0x10\n" - "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x9, #0x20]\n" + "add x23, x23, #0x10\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "ldr q7, [x14, #0x30]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "ldr q7, [x9, #0x30]\n" ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x14, #0x40]\n" + "ldr q6, [x9, #0x40]\n" ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x9, #0x50]\n" ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x9, #0x60]\n" ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x9, #0x70]\n" ".inst 0x6fa0e0ca // udot 
v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x9, #0xa0]\n" ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x9, #0xb0]\n" ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x9, #0xc0]\n" ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x9, #0xd0]\n" ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x9, #0xe0]\n" ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" - "add x14, x14, #0x100\n" + "ldr q7, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, 
v1.4b[3]\n" ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" - "90:" // Height 3: Multiply loop: Main loop skip - "cbz x11, 95f\n" - "cmp x11, #0x4\n" - "blt 92f\n" - "91:" // Height 3: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x28], #0x4\n" - "ldr s2, [x26], #0x4\n" - "ldr q6, [x14, #0x0]\n" + "87:" // Height 3: Multiply loop: Main loop skip + "cbz x26, 92f\n" + "cmp x26, #0x4\n" + "blt 89f\n" + "88:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x4\n" + "ldr s1, [x24], #0x4\n" + "cmp x26, #0x4\n" + "ldr s2, [x23], #0x4\n" + "ldr q6, [x9, #0x0]\n" ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "sub x11, x11, #0x4\n" ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x14, #0x20]\n" - "cmp x11, #0x4\n" + "ldr q6, [x9, #0x20]\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" + "add x9, x9, #0x40\n" ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - "bge 91b\n" - "cbz x11, 95f\n" - "92:" // Height 3: Multiply loop: Skip odd blocks - "tbz x11, #1, 93f\n" - "ldr h0, [x10], #0x2\n" - "ldr h1, [x28], #0x2\n" - "ldr h2, [x26], #0x2\n" - "tbz x11, #0, 94f\n" - "ld1 { v0.b }[2], [x10]\n" - "ld1 { v1.b }[2], [x28]\n" - "ld1 { v2.b }[2], [x26]\n" - "b 94f\n" - "93:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x10, #0x0]\n" - "ldr b1, [x28, #0x0]\n" - "ldr b2, [x26, #0x0]\n" - "94:" // 
Height 3: Multiply loop: Ragged operand read: Done - "ldr q6, [x14, #0x0]\n" + "bge 88b\n" + "cbz x26, 92f\n" + "89:" // Height 3: Multiply loop: Skip odd blocks + "tbz x26, #1, 90f\n" + "ldr h0, [x25], #0x2\n" + "ldr h1, [x24], #0x2\n" + "ldr h2, [x23], #0x2\n" + "tbz x26, #0, 91f\n" + "ld1 { v0.b }[2], [x25]\n" + "ld1 { v1.b }[2], [x24]\n" + "ld1 { v2.b }[2], [x23]\n" + "b 91f\n" + "90:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x25, #0x0]\n" + "ldr b1, [x24, #0x0]\n" + "ldr b2, [x23, #0x0]\n" + "91:" // Height 3: Multiply loop: Ragged operand read: Done + "ldr q6, [x9, #0x0]\n" ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x9, #0x20]\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" + "add x9, x9, #0x40\n" ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - "95:" // Height 3: Multiply loop: No odd multiplies + "92:" // Height 3: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 85b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "cmp x15, #0x10\n" - "prfm pstl1keep, [x27, #0x0]\n" - "bge 104f\n" - "tbz x15, #3, 99f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v9.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v13.4s }, [x9], #0x10\n" - "st1 { 
v16.4s }, [x27], #0x10\n" - "st1 { v17.4s }, [x27], #0x10\n" - "tbz x15, #2, 97f\n" - "st1 { v10.4s }, [x13], #0x10\n" - "st1 { v14.4s }, [x9], #0x10\n" - "st1 { v18.4s }, [x27], #0x10\n" - "tbz x15, #1, 96f\n" - "str d11, [x13], #0x8\n" - "str d15, [x9], #0x8\n" - "str d19, [x27], #0x8\n" - "tbz x15, #0, 103f\n" - "st1 { v11.s }[2], [x13]\n" - "st1 { v15.s }[2], [x9]\n" - "st1 { v19.s }[2], [x27]\n" - "b 103f\n" - "96:" // Height 3: Partial direct writeback: partial_1_12 - "tbz x15, #0, 103f\n" - "str s11, [x13, #0x0]\n" - "str s15, [x9, #0x0]\n" - "str s19, [x27, #0x0]\n" - "b 103f\n" - "97:" // Height 3: Partial direct writeback: partial_2_8 - "tbz x15, #1, 98f\n" - "str d10, [x13], #0x8\n" - "str d14, [x9], #0x8\n" - "str d18, [x27], #0x8\n" - "tbz x15, #0, 103f\n" - "st1 { v10.s }[2], [x13]\n" - "st1 { v14.s }[2], [x9]\n" - "st1 { v18.s }[2], [x27]\n" - "b 103f\n" - "98:" // Height 3: Partial direct writeback: partial_1_8 - "tbz x15, #0, 103f\n" - "str s10, [x13, #0x0]\n" - "str s14, [x9, #0x0]\n" - "str s18, [x27, #0x0]\n" - "b 103f\n" - "99:" // Height 3: Partial direct writeback: partial_4_0 - "tbz x15, #2, 101f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "tbz x15, #1, 100f\n" - "str d9, [x13], #0x8\n" - "str d13, [x9], #0x8\n" - "str d17, [x27], #0x8\n" - "tbz x15, #0, 103f\n" - "st1 { v9.s }[2], [x13]\n" - "st1 { v13.s }[2], [x9]\n" - "st1 { v17.s }[2], [x27]\n" - "b 103f\n" - "100:" // Height 3: Partial direct writeback: partial_1_4 - "tbz x15, #0, 103f\n" - "str s9, [x13, #0x0]\n" - "str s13, [x9, #0x0]\n" - "str s17, [x27, #0x0]\n" - "b 103f\n" - "101:" // Height 3: Partial direct writeback: partial_2_0 - "tbz x15, #1, 102f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "str d16, [x27], #0x8\n" - "tbz x15, #0, 103f\n" - "st1 { v8.s }[2], [x13]\n" - "st1 { v12.s }[2], [x9]\n" - "st1 { v16.s }[2], [x27]\n" - "b 103f\n" - "102:" // Height 3: Partial direct writeback: partial_1_0 - 
"str s8, [x13, #0x0]\n" - "str s12, [x9, #0x0]\n" - "str s16, [x27, #0x0]\n" - "103:" // Height 3: Partial direct writeback: Done - "b 105f\n" - "104:" // Height 3: Full writeback - "str q8, [x13, #0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "str q12, [x9, #0x0]\n" - "str q13, [x9, #0x10]\n" - "str q14, [x9, #0x20]\n" - "str q15, [x9, #0x30]\n" - "str q16, [x27, #0x0]\n" - "str q17, [x27, #0x10]\n" - "str q18, [x27, #0x20]\n" - "str q19, [x27, #0x30]\n" - "add x13, x13, #0x40\n" - "add x9, x9, #0x40\n" - "add x27, x27, #0x40\n" - "105:" // Height 3: Writeback done - "subs x15, x15, #0x10\n" - "bgt 73b\n" - "b 212f\n" - "106:" // Height 4 - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 82b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 107f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "add x27, x27, x19, LSL #2\n" - "add x25, x25, x19, LSL #2\n" - "b 108f\n" - "107:" // Height 4: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "108:" // Height 4: Column loop - "tbz %x[flags], #0, 118f\n" - "cmp x15, #0x10\n" - "bge 117f\n" - "tbz x15, #3, 112f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "ld1 { v20.4s }, [x25], #0x10\n" - "ld1 { v9.4s }, [x13], #0x10\n" - "ld1 { v13.4s }, [x9], #0x10\n" - "ld1 { v17.4s }, [x27], #0x10\n" - "ld1 { v21.4s }, [x25], #0x10\n" - "tbz x15, #2, 110f\n" - "ld1 { v10.4s }, [x13], #0x10\n" - "ld1 { v14.4s }, [x9], #0x10\n" - "ld1 { v18.4s }, [x27], #0x10\n" - "ld1 { v22.4s }, [x25], #0x10\n" - "tbz x15, #1, 109f\n" - "mov x19, #0x38\n" - 
"ldr d11, [x13], #0x8\n" - "ldr d15, [x9], #0x8\n" - "ldr d19, [x27], #0x8\n" - "ldr d23, [x25], #0x8\n" - "tbz x15, #0, 116f\n" - "ld1 { v11.s }[2], [x13]\n" - "ld1 { v15.s }[2], [x9]\n" - "ld1 { v19.s }[2], [x27]\n" - "ld1 { v23.s }[2], [x25]\n" - "b 116f\n" - "109:" // Height 4: Partial accumulate: partial_1_12 - "mov x19, #0x30\n" - "tbz x15, #0, 116f\n" - "ldr s11, [x13, #0x0]\n" - "ldr s15, [x9, #0x0]\n" - "ldr s19, [x27, #0x0]\n" - "ldr s23, [x25, #0x0]\n" - "b 116f\n" - "110:" // Height 4: Partial accumulate: partial_2_8 - "tbz x15, #1, 111f\n" - "ldr d10, [x13], #0x8\n" - "ldr d14, [x9], #0x8\n" - "ldr d18, [x27], #0x8\n" - "ldr d22, [x25], #0x8\n" - "mov x19, #0x28\n" - "tbz x15, #0, 116f\n" - "ld1 { v10.s }[2], [x13]\n" - "ld1 { v14.s }[2], [x9]\n" - "ld1 { v18.s }[2], [x27]\n" - "ld1 { v22.s }[2], [x25]\n" - "b 116f\n" - "111:" // Height 4: Partial accumulate: partial_1_8 - "mov x19, #0x20\n" - "tbz x15, #0, 116f\n" - "ldr s10, [x13, #0x0]\n" - "ldr s14, [x9, #0x0]\n" - "ldr s18, [x27, #0x0]\n" - "ldr s22, [x25, #0x0]\n" - "b 116f\n" - "112:" // Height 4: Partial accumulate: partial_4_0 - "tbz x15, #2, 114f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "ld1 { v20.4s }, [x25], #0x10\n" - "tbz x15, #1, 113f\n" - "mov x19, #0x18\n" - "ldr d9, [x13], #0x8\n" - "ldr d13, [x9], #0x8\n" - "ldr d17, [x27], #0x8\n" - "ldr d21, [x25], #0x8\n" - "tbz x15, #0, 116f\n" - "ld1 { v9.s }[2], [x13]\n" - "ld1 { v13.s }[2], [x9]\n" - "ld1 { v17.s }[2], [x27]\n" - "ld1 { v21.s }[2], [x25]\n" - "b 116f\n" - "113:" // Height 4: Partial accumulate: partial_1_4 - "mov x19, #0x10\n" - "tbz x15, #0, 116f\n" - "ldr s9, [x13, #0x0]\n" - "ldr s13, [x9, #0x0]\n" - "ldr s17, [x27, #0x0]\n" - "ldr s21, [x25, #0x0]\n" - "b 116f\n" - "114:" // Height 4: Partial accumulate: partial_2_0 - "tbz x15, #1, 115f\n" - "ldr d8, [x13], #0x8\n" - "ldr d12, [x9], #0x8\n" - "ldr d16, [x27], #0x8\n" - "ldr d20, [x25], #0x8\n" - "mov x19, 
#0x8\n" - "tbz x15, #0, 116f\n" - "ld1 { v8.s }[2], [x13]\n" - "ld1 { v12.s }[2], [x9]\n" - "ld1 { v16.s }[2], [x27]\n" - "ld1 { v20.s }[2], [x25]\n" - "b 116f\n" - "115:" // Height 4: Partial accumulate: partial_1_0 - "mov x19, #0x0\n" - "ldr s8, [x13, #0x0]\n" - "ldr s12, [x9, #0x0]\n" - "ldr s16, [x27, #0x0]\n" - "ldr s20, [x25, #0x0]\n" - "116:" // Height 4: Partial accumulate: Done - "sub x13, x13, x19\n" - "sub x9, x9, x19\n" - "sub x27, x27, x19\n" - "sub x25, x25, x19\n" - "b 119f\n" - "117:" // Height 4: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "ldr q12, [x9, #0x0]\n" - "ldr q13, [x9, #0x10]\n" - "ldr q14, [x9, #0x20]\n" - "ldr q15, [x9, #0x30]\n" - "ldr q16, [x27, #0x0]\n" - "ldr q17, [x27, #0x10]\n" - "ldr q18, [x27, #0x20]\n" - "ldr q19, [x27, #0x30]\n" - "ldr q20, [x25, #0x0]\n" - "ldr q21, [x25, #0x10]\n" - "ldr q22, [x25, #0x20]\n" - "ldr q23, [x25, #0x30]\n" - "b 119f\n" - "118:" // Height 4: no accumulate + "prfm pstl1keep, [x28, #0x0]\n" + "cmp x10, #0x10\n" + "add x23, x28, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "bge 101f\n" + "tbz x10, #3, 96f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v9.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v13.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v17.4s }, [x22], #0x10\n" + "tbz x10, #2, 94f\n" + "st1 { v10.4s }, [x28], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "st1 { v18.4s }, [x22], #0x10\n" + "tbz x10, #1, 93f\n" + "str d11, [x28], #0x8\n" + "str d15, [x23], #0x8\n" + "str d19, [x22], #0x8\n" + "tbz x10, #0, 100f\n" + "st1 { v11.s }[2], [x28]\n" + "st1 { v15.s }[2], [x23]\n" + "st1 { v19.s }[2], [x22]\n" + "b 100f\n" + "93:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x10, #0, 100f\n" + "str s11, [x28, #0x0]\n" + "str s15, [x23, #0x0]\n" + "str s19, [x22, #0x0]\n" + "b 100f\n" + "94:" 
// Height 3: Partial direct writeback: partial_2_8 + "tbz x10, #1, 95f\n" + "str d10, [x28], #0x8\n" + "str d14, [x23], #0x8\n" + "str d18, [x22], #0x8\n" + "tbz x10, #0, 100f\n" + "st1 { v10.s }[2], [x28]\n" + "st1 { v14.s }[2], [x23]\n" + "st1 { v18.s }[2], [x22]\n" + "b 100f\n" + "95:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x10, #0, 100f\n" + "str s10, [x28, #0x0]\n" + "str s14, [x23, #0x0]\n" + "str s18, [x22, #0x0]\n" + "b 100f\n" + "96:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x10, #2, 98f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "tbz x10, #1, 97f\n" + "str d9, [x28], #0x8\n" + "str d13, [x23], #0x8\n" + "str d17, [x22], #0x8\n" + "tbz x10, #0, 100f\n" + "st1 { v9.s }[2], [x28]\n" + "st1 { v13.s }[2], [x23]\n" + "st1 { v17.s }[2], [x22]\n" + "b 100f\n" + "97:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x10, #0, 100f\n" + "str s9, [x28, #0x0]\n" + "str s13, [x23, #0x0]\n" + "str s17, [x22, #0x0]\n" + "b 100f\n" + "98:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x10, #1, 99f\n" + "str d8, [x28], #0x8\n" + "str d12, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "tbz x10, #0, 100f\n" + "st1 { v8.s }[2], [x28]\n" + "st1 { v12.s }[2], [x23]\n" + "st1 { v16.s }[2], [x22]\n" + "b 100f\n" + "99:" // Height 3: Partial direct writeback: partial_1_0 + "str s8, [x28, #0x0]\n" + "str s12, [x23, #0x0]\n" + "str s16, [x22, #0x0]\n" + "100:" // Height 3: Partial direct writeback: Done + "b 102f\n" + "101:" // Height 3: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q12, [x23, #0x0]\n" + "str q13, [x23, #0x10]\n" + "str q14, [x23, #0x20]\n" + "str q15, [x23, #0x30]\n" + "str q16, [x22, #0x0]\n" + "str q17, [x22, #0x10]\n" + "str q18, [x22, #0x20]\n" + "str q19, [x22, #0x30]\n" + "102:" // Height 3: Writeback done + "subs x10, x10, 
#0x10\n" + "bgt 70b\n" + "b 206f\n" + "103:" // Height 4 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "104:" // Height 4: Column loop + "tbz %x[flags], #0, 114f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x10, #0x10\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "bge 113f\n" + "tbz x10, #3, 108f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { v21.4s }, [x21], #0x10\n" + "tbz x10, #2, 106f\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { v18.4s }, [x22], #0x10\n" + "ld1 { v22.4s }, [x21], #0x10\n" + "tbz x10, #1, 105f\n" + "mov x24, #0x38\n" + "ldr d11, [x28], #0x8\n" + "ldr d15, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" + "tbz x10, #0, 112f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x23]\n" + "ld1 { v19.s }[2], [x22]\n" + "ld1 { v23.s }[2], [x21]\n" + "b 112f\n" + "105:" // Height 4: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x10, #0, 112f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "ldr s23, [x21, #0x0]\n" + "b 112f\n" + "106:" // Height 4: Partial accumulate: partial_2_8 + "tbz x10, #1, 107f\n" + "ldr d10, [x28], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "ldr d18, [x22], #0x8\n" + "ldr d22, [x21], #0x8\n" + "tbz x10, #0, 112f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x23]\n" + "ld1 { v18.s }[2], [x22]\n" + "ld1 { v22.s }[2], [x21]\n" + "b 112f\n" + "107:" // Height 4: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x10, #0, 112f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "ldr s22, [x21, 
#0x0]\n" + "b 112f\n" + "108:" // Height 4: Partial accumulate: partial_4_0 + "tbz x10, #2, 110f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "tbz x10, #1, 109f\n" + "mov x24, #0x18\n" + "ldr d9, [x28], #0x8\n" + "ldr d13, [x23], #0x8\n" + "ldr d17, [x22], #0x8\n" + "ldr d21, [x21], #0x8\n" + "tbz x10, #0, 112f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x23]\n" + "ld1 { v17.s }[2], [x22]\n" + "ld1 { v21.s }[2], [x21]\n" + "b 112f\n" + "109:" // Height 4: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x10, #0, 112f\n" + "ldr s9, [x28, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "ldr s21, [x21, #0x0]\n" + "b 112f\n" + "110:" // Height 4: Partial accumulate: partial_2_0 + "tbz x10, #1, 111f\n" + "ldr d8, [x28], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "ldr d16, [x22], #0x8\n" + "ldr d20, [x21], #0x8\n" + "tbz x10, #0, 112f\n" + "ld1 { v8.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x23]\n" + "ld1 { v16.s }[2], [x22]\n" + "ld1 { v20.s }[2], [x21]\n" + "b 112f\n" + "111:" // Height 4: Partial accumulate: partial_1_0 + "ldr s8, [x28, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "ldr s16, [x22, #0x0]\n" + "ldr s20, [x21, #0x0]\n" + "112:" // Height 4: Partial accumulate: Done + "sub x28, x28, x24\n" + "b 115f\n" + "113:" // Height 4: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + "ldr q16, [x22, #0x0]\n" + "ldr q17, [x22, #0x10]\n" + "ldr q18, [x22, #0x20]\n" + "ldr q19, [x22, #0x30]\n" + "ldr q20, [x21, #0x0]\n" + "ldr q21, [x21, #0x10]\n" + "ldr q22, [x21, #0x20]\n" + "ldr q23, [x21, #0x30]\n" + "b 115f\n" + "114:" // Height 4: no accumulate "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -1399,220 
+1362,220 @@ void a64_hybrid_u8u32_dot_6x16 ( "movi v21.4s, #0x0\n" "movi v22.4s, #0x0\n" "movi v23.4s, #0x0\n" - "119:" // Height 4: setup done - "mov x12, #0x0\n" - "120:" // Height 4: String loop + "115:" // Height 4: setup done + "mov x27, #0x0\n" + "116:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 121f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 117f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "cbnz x12, 122f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "cbnz x27, 118f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "add x26, x26, x19\n" + "add x25, x25, x19\n" "add x24, x24, x19\n" - "b 122f\n" - "121:" // Height 4: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "add x26, x28, x19\n" - "add x24, x26, x19\n" - "122:" // Height 4: input setup done - "cmp x11, #0x10\n" - "blt 125f\n" - "cmp x11, #0x20\n" - "blt 124f\n" - "123:" // Height 4: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "b 118f\n" + "117:" // Height 4: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "118:" // Height 4: input setup done + "cmp x26, #0x10\n" + "blt 121f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x20\n" + "ldr q2, [x23, #0x0]\n" + "ldr q3, [x22, #0x0]\n" + "ldr q6, [x9, #0x0]\n" + "blt 120f\n" + "119:" // Height 
4: Multiply loop: Main loop head ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" + "add x25, x25, #0x10\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x23, x23, #0x10\n" ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x9, #0x20]\n" + "add x22, x22, #0x10\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "add x26, x26, #0x10\n" - "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "sub x26, x26, #0x10\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - "add x24, x24, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "cmp x26, #0x20\n" ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "sub x11, x11, #0x10\n" ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - "cmp x11, #0x20\n" ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x14, #0x40]\n" + "ldr q6, [x9, #0x40]\n" ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x9, #0x50]\n" ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x9, 
#0x60]\n" ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x9, #0x70]\n" ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x9, #0xa0]\n" ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x9, #0xb0]\n" ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x9, #0xc0]\n" ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x9, #0xd0]\n" ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" 
".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x9, #0xe0]\n" ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" - "add x14, x14, #0x100\n" + "ldr q7, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" + "ldr q6, [x9, #0x0]\n" ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + "ldr q0, [x25, #0x0]\n" ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + "ldr q1, [x24, #0x0]\n" ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + "ldr q2, [x23, #0x0]\n" ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" - "bge 123b\n" - "124:" // Height 4: Multiply loop: Single iteration only - "sub x11, x11, #0x10\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "ldr q3, [x22, #0x0]\n" + "bge 119b\n" + "120:" // Height 4: Multiply loop: Single iteration only ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" + "sub x26, x26, #0x10\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "add x25, x25, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - "add x28, x28, #0x10\n" + "add x24, x24, #0x10\n" ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "ldr q6, [x14, #0x20]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x23, x23, #0x10\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "add 
x26, x26, #0x10\n" - "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x9, #0x20]\n" + "add x22, x22, #0x10\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - "add x24, x24, #0x10\n" + "prfm pldl1keep, [x23, #0x80]\n" ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x14, #0x40]\n" + "ldr q6, [x9, #0x40]\n" ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x9, #0x50]\n" ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x9, #0x60]\n" ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x9, #0x70]\n" ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, 
v3.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x9, #0xa0]\n" ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x9, #0xb0]\n" ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x9, #0xc0]\n" ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x9, #0xd0]\n" ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x9, #0xe0]\n" ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" - "add x14, x14, #0x100\n" + "ldr q7, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" @@ -1621,31 +1584,31 @@ void a64_hybrid_u8u32_dot_6x16 ( ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" 
".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" - "125:" // Height 4: Multiply loop: Main loop skip - "cbz x11, 130f\n" - "cmp x11, #0x4\n" - "blt 127f\n" - "126:" // Height 4: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x28], #0x4\n" - "ldr s2, [x26], #0x4\n" - "ldr s3, [x24], #0x4\n" - "ldr q6, [x14, #0x0]\n" + "121:" // Height 4: Multiply loop: Main loop skip + "cbz x26, 126f\n" + "cmp x26, #0x4\n" + "blt 123f\n" + "122:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x4\n" + "ldr s1, [x24], #0x4\n" + "cmp x26, #0x4\n" + "ldr s2, [x23], #0x4\n" + "ldr s3, [x22], #0x4\n" + "ldr q6, [x9, #0x0]\n" ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "sub x11, x11, #0x4\n" ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - "cmp x11, #0x4\n" ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x9, #0x20]\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" + "add x9, x9, #0x40\n" ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" @@ -1653,40 +1616,40 @@ void a64_hybrid_u8u32_dot_6x16 ( ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" - "bge 126b\n" - "cbz x11, 130f\n" - "127:" // Height 4: Multiply loop: Skip odd blocks - "tbz x11, #1, 128f\n" - "ldr h0, [x10], #0x2\n" - "ldr h1, 
[x28], #0x2\n" - "ldr h2, [x26], #0x2\n" - "ldr h3, [x24], #0x2\n" - "tbz x11, #0, 129f\n" - "ld1 { v0.b }[2], [x10]\n" - "ld1 { v1.b }[2], [x28]\n" - "ld1 { v2.b }[2], [x26]\n" - "ld1 { v3.b }[2], [x24]\n" - "b 129f\n" - "128:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x10, #0x0]\n" - "ldr b1, [x28, #0x0]\n" - "ldr b2, [x26, #0x0]\n" - "ldr b3, [x24, #0x0]\n" - "129:" // Height 4: Multiply loop: Ragged operand read: Done - "ldr q6, [x14, #0x0]\n" + "bge 122b\n" + "cbz x26, 126f\n" + "123:" // Height 4: Multiply loop: Skip odd blocks + "tbz x26, #1, 124f\n" + "ldr h0, [x25], #0x2\n" + "ldr h1, [x24], #0x2\n" + "ldr h2, [x23], #0x2\n" + "ldr h3, [x22], #0x2\n" + "tbz x26, #0, 125f\n" + "ld1 { v0.b }[2], [x25]\n" + "ld1 { v1.b }[2], [x24]\n" + "ld1 { v2.b }[2], [x23]\n" + "ld1 { v3.b }[2], [x22]\n" + "b 125f\n" + "124:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x25, #0x0]\n" + "ldr b1, [x24, #0x0]\n" + "ldr b2, [x23, #0x0]\n" + "ldr b3, [x22, #0x0]\n" + "125:" // Height 4: Multiply loop: Ragged operand read: Done + "ldr q6, [x9, #0x0]\n" ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x9, #0x20]\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" + "add x9, x9, #0x40\n" ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" @@ -1694,308 +1657,292 @@ void 
a64_hybrid_u8u32_dot_6x16 ( ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" - "130:" // Height 4: Multiply loop: No odd multiplies + "126:" // Height 4: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 120b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "cmp x15, #0x10\n" - "prfm pstl1keep, [x27, #0x0]\n" - "prfm pstl1keep, [x25, #0x0]\n" - "bge 139f\n" - "tbz x15, #3, 134f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v9.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v13.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "st1 { v17.4s }, [x27], #0x10\n" - "st1 { v20.4s }, [x25], #0x10\n" - "st1 { v21.4s }, [x25], #0x10\n" - "tbz x15, #2, 132f\n" - "st1 { v10.4s }, [x13], #0x10\n" - "st1 { v14.4s }, [x9], #0x10\n" - "st1 { v18.4s }, [x27], #0x10\n" - "st1 { v22.4s }, [x25], #0x10\n" - "tbz x15, #1, 131f\n" - "str d11, [x13], #0x8\n" - "str d15, [x9], #0x8\n" - "str d19, [x27], #0x8\n" - "str d23, [x25], #0x8\n" - "tbz x15, #0, 138f\n" - "st1 { v11.s }[2], [x13]\n" - "st1 { v15.s }[2], [x9]\n" - "st1 { v19.s }[2], [x27]\n" - "st1 { v23.s }[2], [x25]\n" - "b 138f\n" - "131:" // Height 4: Partial direct writeback: partial_1_12 - "tbz x15, #0, 138f\n" - "str s11, [x13, #0x0]\n" - "str s15, [x9, #0x0]\n" - "str s19, [x27, #0x0]\n" - "str s23, [x25, #0x0]\n" - "b 138f\n" - "132:" // Height 4: Partial direct writeback: partial_2_8 - "tbz x15, #1, 133f\n" - "str d10, [x13], #0x8\n" - "str d14, [x9], #0x8\n" - "str d18, [x27], #0x8\n" - "str d22, [x25], #0x8\n" - "tbz x15, #0, 138f\n" - "st1 { v10.s }[2], [x13]\n" - "st1 { v14.s }[2], [x9]\n" - "st1 { v18.s }[2], [x27]\n" - "st1 { v22.s }[2], [x25]\n" - "b 138f\n" - "133:" // Height 4: Partial direct writeback: partial_1_8 - "tbz x15, #0, 138f\n" - "str s10, [x13, #0x0]\n" - 
"str s14, [x9, #0x0]\n" - "str s18, [x27, #0x0]\n" - "str s22, [x25, #0x0]\n" - "b 138f\n" - "134:" // Height 4: Partial direct writeback: partial_4_0 - "tbz x15, #2, 136f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "st1 { v20.4s }, [x25], #0x10\n" - "tbz x15, #1, 135f\n" - "str d9, [x13], #0x8\n" - "str d13, [x9], #0x8\n" - "str d17, [x27], #0x8\n" - "str d21, [x25], #0x8\n" - "tbz x15, #0, 138f\n" - "st1 { v9.s }[2], [x13]\n" - "st1 { v13.s }[2], [x9]\n" - "st1 { v17.s }[2], [x27]\n" - "st1 { v21.s }[2], [x25]\n" - "b 138f\n" - "135:" // Height 4: Partial direct writeback: partial_1_4 - "tbz x15, #0, 138f\n" - "str s9, [x13, #0x0]\n" - "str s13, [x9, #0x0]\n" - "str s17, [x27, #0x0]\n" - "str s21, [x25, #0x0]\n" - "b 138f\n" - "136:" // Height 4: Partial direct writeback: partial_2_0 - "tbz x15, #1, 137f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "str d16, [x27], #0x8\n" - "str d20, [x25], #0x8\n" - "tbz x15, #0, 138f\n" - "st1 { v8.s }[2], [x13]\n" - "st1 { v12.s }[2], [x9]\n" - "st1 { v16.s }[2], [x27]\n" - "st1 { v20.s }[2], [x25]\n" - "b 138f\n" - "137:" // Height 4: Partial direct writeback: partial_1_0 - "str s8, [x13, #0x0]\n" - "str s12, [x9, #0x0]\n" - "str s16, [x27, #0x0]\n" - "str s20, [x25, #0x0]\n" - "138:" // Height 4: Partial direct writeback: Done - "b 140f\n" - "139:" // Height 4: Full writeback - "str q8, [x13, #0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "str q12, [x9, #0x0]\n" - "str q13, [x9, #0x10]\n" - "str q14, [x9, #0x20]\n" - "str q15, [x9, #0x30]\n" - "str q16, [x27, #0x0]\n" - "str q17, [x27, #0x10]\n" - "str q18, [x27, #0x20]\n" - "str q19, [x27, #0x30]\n" - "str q20, [x25, #0x0]\n" - "str q21, [x25, #0x10]\n" - "str q22, [x25, #0x20]\n" - "str q23, [x25, #0x30]\n" - "add x13, x13, #0x40\n" - "add x9, x9, #0x40\n" - "add x27, x27, #0x40\n" - "add x25, x25, #0x40\n" - "140:" // Height 4: Writeback done - "subs 
x15, x15, #0x10\n" - "bgt 108b\n" - "b 212f\n" - "141:" // Height 5 - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 116b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 142f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "ldr x23, [%x[output_ptr], #0x20]\n" - "add x27, x27, x19, LSL #2\n" - "add x25, x25, x19, LSL #2\n" - "add x23, x23, x19, LSL #2\n" - "b 143f\n" - "142:" // Height 5: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "add x23, x25, x19, LSL #2\n" - "143:" // Height 5: Column loop - "tbz %x[flags], #0, 153f\n" - "cmp x15, #0x10\n" - "bge 152f\n" - "tbz x15, #3, 147f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "ld1 { v20.4s }, [x25], #0x10\n" - "ld1 { v24.4s }, [x23], #0x10\n" - "ld1 { v9.4s }, [x13], #0x10\n" - "ld1 { v13.4s }, [x9], #0x10\n" - "ld1 { v17.4s }, [x27], #0x10\n" - "ld1 { v21.4s }, [x25], #0x10\n" - "ld1 { v25.4s }, [x23], #0x10\n" - "tbz x15, #2, 145f\n" - "ld1 { v10.4s }, [x13], #0x10\n" - "ld1 { v14.4s }, [x9], #0x10\n" - "ld1 { v18.4s }, [x27], #0x10\n" - "ld1 { v22.4s }, [x25], #0x10\n" - "ld1 { v26.4s }, [x23], #0x10\n" - "tbz x15, #1, 144f\n" - "mov x19, #0x38\n" - "ldr d11, [x13], #0x8\n" - "ldr d15, [x9], #0x8\n" - "ldr d19, [x27], #0x8\n" - "ldr d23, [x25], #0x8\n" - "ldr d27, [x23], #0x8\n" - "tbz x15, #0, 151f\n" - "ld1 { v11.s }[2], [x13]\n" - "ld1 { v15.s }[2], [x9]\n" - "ld1 { v19.s }[2], [x27]\n" - "ld1 { v23.s }[2], [x25]\n" - "ld1 { v27.s }[2], [x23]\n" - "b 151f\n" - "144:" // Height 5: Partial accumulate: partial_1_12 - "mov x19, #0x30\n" - "tbz x15, #0, 151f\n" 
- "ldr s11, [x13, #0x0]\n" - "ldr s15, [x9, #0x0]\n" - "ldr s19, [x27, #0x0]\n" - "ldr s23, [x25, #0x0]\n" - "ldr s27, [x23, #0x0]\n" - "b 151f\n" - "145:" // Height 5: Partial accumulate: partial_2_8 - "tbz x15, #1, 146f\n" - "ldr d10, [x13], #0x8\n" - "ldr d14, [x9], #0x8\n" - "ldr d18, [x27], #0x8\n" - "ldr d22, [x25], #0x8\n" - "ldr d26, [x23], #0x8\n" - "mov x19, #0x28\n" - "tbz x15, #0, 151f\n" - "ld1 { v10.s }[2], [x13]\n" - "ld1 { v14.s }[2], [x9]\n" - "ld1 { v18.s }[2], [x27]\n" - "ld1 { v22.s }[2], [x25]\n" - "ld1 { v26.s }[2], [x23]\n" - "b 151f\n" - "146:" // Height 5: Partial accumulate: partial_1_8 - "mov x19, #0x20\n" - "tbz x15, #0, 151f\n" - "ldr s10, [x13, #0x0]\n" - "ldr s14, [x9, #0x0]\n" - "ldr s18, [x27, #0x0]\n" - "ldr s22, [x25, #0x0]\n" - "ldr s26, [x23, #0x0]\n" - "b 151f\n" - "147:" // Height 5: Partial accumulate: partial_4_0 - "tbz x15, #2, 149f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "ld1 { v20.4s }, [x25], #0x10\n" - "ld1 { v24.4s }, [x23], #0x10\n" - "tbz x15, #1, 148f\n" - "mov x19, #0x18\n" - "ldr d9, [x13], #0x8\n" - "ldr d13, [x9], #0x8\n" - "ldr d17, [x27], #0x8\n" - "ldr d21, [x25], #0x8\n" - "ldr d25, [x23], #0x8\n" - "tbz x15, #0, 151f\n" - "ld1 { v9.s }[2], [x13]\n" - "ld1 { v13.s }[2], [x9]\n" - "ld1 { v17.s }[2], [x27]\n" - "ld1 { v21.s }[2], [x25]\n" - "ld1 { v25.s }[2], [x23]\n" - "b 151f\n" - "148:" // Height 5: Partial accumulate: partial_1_4 - "mov x19, #0x10\n" - "tbz x15, #0, 151f\n" - "ldr s9, [x13, #0x0]\n" - "ldr s13, [x9, #0x0]\n" - "ldr s17, [x27, #0x0]\n" - "ldr s21, [x25, #0x0]\n" - "ldr s25, [x23, #0x0]\n" - "b 151f\n" - "149:" // Height 5: Partial accumulate: partial_2_0 - "tbz x15, #1, 150f\n" - "ldr d8, [x13], #0x8\n" - "ldr d12, [x9], #0x8\n" - "ldr d16, [x27], #0x8\n" - "ldr d20, [x25], #0x8\n" - "ldr d24, [x23], #0x8\n" - "mov x19, #0x8\n" - "tbz x15, #0, 151f\n" - "ld1 { v8.s }[2], [x13]\n" - "ld1 { v12.s }[2], [x9]\n" - "ld1 { v16.s 
}[2], [x27]\n" - "ld1 { v20.s }[2], [x25]\n" - "ld1 { v24.s }[2], [x23]\n" - "b 151f\n" - "150:" // Height 5: Partial accumulate: partial_1_0 - "mov x19, #0x0\n" - "ldr s8, [x13, #0x0]\n" - "ldr s12, [x9, #0x0]\n" - "ldr s16, [x27, #0x0]\n" - "ldr s20, [x25, #0x0]\n" - "ldr s24, [x23, #0x0]\n" - "151:" // Height 5: Partial accumulate: Done - "sub x13, x13, x19\n" - "sub x9, x9, x19\n" - "sub x27, x27, x19\n" - "sub x25, x25, x19\n" - "sub x23, x23, x19\n" - "b 154f\n" - "152:" // Height 5: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "ldr q12, [x9, #0x0]\n" - "ldr q13, [x9, #0x10]\n" - "ldr q14, [x9, #0x20]\n" - "ldr q15, [x9, #0x30]\n" - "ldr q16, [x27, #0x0]\n" - "ldr q17, [x27, #0x10]\n" - "ldr q18, [x27, #0x20]\n" - "ldr q19, [x27, #0x30]\n" - "ldr q20, [x25, #0x0]\n" - "ldr q21, [x25, #0x10]\n" - "ldr q22, [x25, #0x20]\n" - "ldr q23, [x25, #0x30]\n" - "ldr q24, [x23, #0x0]\n" - "ldr q25, [x23, #0x10]\n" - "ldr q26, [x23, #0x20]\n" - "ldr q27, [x23, #0x30]\n" - "b 154f\n" - "153:" // Height 5: no accumulate + "prfm pstl1keep, [x28, #0x0]\n" + "cmp x10, #0x10\n" + "add x23, x28, x19, LSL #2\n" + "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" + "prfm pstl1keep, [x21, #0x0]\n" + "bge 135f\n" + "tbz x10, #3, 130f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v9.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v13.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v17.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "st1 { v21.4s }, [x21], #0x10\n" + "tbz x10, #2, 128f\n" + "st1 { v10.4s }, [x28], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "st1 { v18.4s }, [x22], #0x10\n" + "st1 { v22.4s }, [x21], #0x10\n" + "tbz x10, #1, 127f\n" + "str d11, [x28], #0x8\n" + "str d15, [x23], #0x8\n" + "str d19, [x22], #0x8\n" + "str d23, [x21], #0x8\n" + "tbz x10, #0, 134f\n" 
+ "st1 { v11.s }[2], [x28]\n" + "st1 { v15.s }[2], [x23]\n" + "st1 { v19.s }[2], [x22]\n" + "st1 { v23.s }[2], [x21]\n" + "b 134f\n" + "127:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x10, #0, 134f\n" + "str s11, [x28, #0x0]\n" + "str s15, [x23, #0x0]\n" + "str s19, [x22, #0x0]\n" + "str s23, [x21, #0x0]\n" + "b 134f\n" + "128:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x10, #1, 129f\n" + "str d10, [x28], #0x8\n" + "str d14, [x23], #0x8\n" + "str d18, [x22], #0x8\n" + "str d22, [x21], #0x8\n" + "tbz x10, #0, 134f\n" + "st1 { v10.s }[2], [x28]\n" + "st1 { v14.s }[2], [x23]\n" + "st1 { v18.s }[2], [x22]\n" + "st1 { v22.s }[2], [x21]\n" + "b 134f\n" + "129:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x10, #0, 134f\n" + "str s10, [x28, #0x0]\n" + "str s14, [x23, #0x0]\n" + "str s18, [x22, #0x0]\n" + "str s22, [x21, #0x0]\n" + "b 134f\n" + "130:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x10, #2, 132f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "tbz x10, #1, 131f\n" + "str d9, [x28], #0x8\n" + "str d13, [x23], #0x8\n" + "str d17, [x22], #0x8\n" + "str d21, [x21], #0x8\n" + "tbz x10, #0, 134f\n" + "st1 { v9.s }[2], [x28]\n" + "st1 { v13.s }[2], [x23]\n" + "st1 { v17.s }[2], [x22]\n" + "st1 { v21.s }[2], [x21]\n" + "b 134f\n" + "131:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x10, #0, 134f\n" + "str s9, [x28, #0x0]\n" + "str s13, [x23, #0x0]\n" + "str s17, [x22, #0x0]\n" + "str s21, [x21, #0x0]\n" + "b 134f\n" + "132:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x10, #1, 133f\n" + "str d8, [x28], #0x8\n" + "str d12, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "str d20, [x21], #0x8\n" + "tbz x10, #0, 134f\n" + "st1 { v8.s }[2], [x28]\n" + "st1 { v12.s }[2], [x23]\n" + "st1 { v16.s }[2], [x22]\n" + "st1 { v20.s }[2], [x21]\n" + "b 134f\n" + "133:" // Height 4: Partial direct 
writeback: partial_1_0 + "str s8, [x28, #0x0]\n" + "str s12, [x23, #0x0]\n" + "str s16, [x22, #0x0]\n" + "str s20, [x21, #0x0]\n" + "134:" // Height 4: Partial direct writeback: Done + "b 136f\n" + "135:" // Height 4: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q12, [x23, #0x0]\n" + "str q13, [x23, #0x10]\n" + "str q14, [x23, #0x20]\n" + "str q15, [x23, #0x30]\n" + "str q16, [x22, #0x0]\n" + "str q17, [x22, #0x10]\n" + "str q18, [x22, #0x20]\n" + "str q19, [x22, #0x30]\n" + "str q20, [x21, #0x0]\n" + "str q21, [x21, #0x10]\n" + "str q22, [x21, #0x20]\n" + "str q23, [x21, #0x30]\n" + "136:" // Height 4: Writeback done + "subs x10, x10, #0x10\n" + "bgt 104b\n" + "b 206f\n" + "137:" // Height 5 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "138:" // Height 5: Column loop + "tbz %x[flags], #0, 148f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x10, #0x10\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "bge 147f\n" + "tbz x10, #3, 142f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v24.4s }, [x20], #0x10\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { v21.4s }, [x21], #0x10\n" + "ld1 { v25.4s }, [x20], #0x10\n" + "tbz x10, #2, 140f\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + "ld1 { v18.4s }, [x22], #0x10\n" + "ld1 { v22.4s }, [x21], #0x10\n" + "ld1 { v26.4s }, [x20], #0x10\n" + "tbz x10, #1, 139f\n" + "mov x24, #0x38\n" + "ldr d11, [x28], #0x8\n" + "ldr d15, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" + "ldr d27, [x20], #0x8\n" + "tbz x10, #0, 146f\n" + 
"ld1 { v11.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x23]\n" + "ld1 { v19.s }[2], [x22]\n" + "ld1 { v23.s }[2], [x21]\n" + "ld1 { v27.s }[2], [x20]\n" + "b 146f\n" + "139:" // Height 5: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x10, #0, 146f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "ldr s23, [x21, #0x0]\n" + "ldr s27, [x20, #0x0]\n" + "b 146f\n" + "140:" // Height 5: Partial accumulate: partial_2_8 + "tbz x10, #1, 141f\n" + "ldr d10, [x28], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "ldr d18, [x22], #0x8\n" + "ldr d22, [x21], #0x8\n" + "ldr d26, [x20], #0x8\n" + "tbz x10, #0, 146f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x23]\n" + "ld1 { v18.s }[2], [x22]\n" + "ld1 { v22.s }[2], [x21]\n" + "ld1 { v26.s }[2], [x20]\n" + "b 146f\n" + "141:" // Height 5: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x10, #0, 146f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "ldr s22, [x21, #0x0]\n" + "ldr s26, [x20, #0x0]\n" + "b 146f\n" + "142:" // Height 5: Partial accumulate: partial_4_0 + "tbz x10, #2, 144f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v24.4s }, [x20], #0x10\n" + "tbz x10, #1, 143f\n" + "mov x24, #0x18\n" + "ldr d9, [x28], #0x8\n" + "ldr d13, [x23], #0x8\n" + "ldr d17, [x22], #0x8\n" + "ldr d21, [x21], #0x8\n" + "ldr d25, [x20], #0x8\n" + "tbz x10, #0, 146f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x23]\n" + "ld1 { v17.s }[2], [x22]\n" + "ld1 { v21.s }[2], [x21]\n" + "ld1 { v25.s }[2], [x20]\n" + "b 146f\n" + "143:" // Height 5: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x10, #0, 146f\n" + "ldr s9, [x28, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "ldr s21, [x21, #0x0]\n" + "ldr s25, [x20, #0x0]\n" + "b 146f\n" + "144:" // Height 5: Partial accumulate: partial_2_0 + 
"tbz x10, #1, 145f\n" + "ldr d8, [x28], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "ldr d16, [x22], #0x8\n" + "ldr d20, [x21], #0x8\n" + "ldr d24, [x20], #0x8\n" + "tbz x10, #0, 146f\n" + "ld1 { v8.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x23]\n" + "ld1 { v16.s }[2], [x22]\n" + "ld1 { v20.s }[2], [x21]\n" + "ld1 { v24.s }[2], [x20]\n" + "b 146f\n" + "145:" // Height 5: Partial accumulate: partial_1_0 + "ldr s8, [x28, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "ldr s16, [x22, #0x0]\n" + "ldr s20, [x21, #0x0]\n" + "ldr s24, [x20, #0x0]\n" + "146:" // Height 5: Partial accumulate: Done + "sub x28, x28, x24\n" + "b 149f\n" + "147:" // Height 5: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + "ldr q16, [x22, #0x0]\n" + "ldr q17, [x22, #0x10]\n" + "ldr q18, [x22, #0x20]\n" + "ldr q19, [x22, #0x30]\n" + "ldr q20, [x21, #0x0]\n" + "ldr q21, [x21, #0x10]\n" + "ldr q22, [x21, #0x20]\n" + "ldr q23, [x21, #0x30]\n" + "ldr q24, [x20, #0x0]\n" + "ldr q25, [x20, #0x10]\n" + "ldr q26, [x20, #0x20]\n" + "ldr q27, [x20, #0x30]\n" + "b 149f\n" + "148:" // Height 5: no accumulate "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -2016,260 +1963,260 @@ void a64_hybrid_u8u32_dot_6x16 ( "movi v25.4s, #0x0\n" "movi v26.4s, #0x0\n" "movi v27.4s, #0x0\n" - "154:" // Height 5: setup done - "mov x12, #0x0\n" - "155:" // Height 5: String loop + "149:" // Height 5: setup done + "mov x27, #0x0\n" + "150:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 156f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 151f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add 
x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "ldr x22, [x20, #0x20]\n" - "cbnz x12, 157f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "cbnz x27, 152f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "add x26, x26, x19\n" + "add x25, x25, x19\n" "add x24, x24, x19\n" + "add x23, x23, x19\n" "add x22, x22, x19\n" - "b 157f\n" - "156:" // Height 5: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "add x26, x28, x19\n" - "add x24, x26, x19\n" - "add x22, x24, x19\n" - "157:" // Height 5: input setup done - "cmp x11, #0x10\n" - "blt 160f\n" - "cmp x11, #0x20\n" - "blt 159f\n" - "158:" // Height 5: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q4, [x22, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "add x21, x21, x19\n" + "b 152f\n" + "151:" // Height 5: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "152:" // Height 5: input setup done + "cmp x26, #0x10\n" + "blt 155f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x20\n" + "ldr q2, [x23, #0x0]\n" + "ldr q3, [x22, #0x0]\n" + "ldr q4, [x21, #0x0]\n" + "ldr q6, [x9, #0x0]\n" + "blt 154f\n" + "153:" // Height 5: Multiply loop: Main loop head ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" + "add x25, x25, #0x10\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x24, 
#0x80]\n" + "add x23, x23, #0x10\n" ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x23, #0x80]\n" + "add x22, x22, #0x10\n" ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x9, #0x20]\n" + "add x21, x21, #0x10\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "add x24, x24, #0x10\n" - "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sub x26, x26, #0x10\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - "add x22, x22, #0x10\n" + "prfm pldl1keep, [x21, #0x80]\n" + "cmp x26, #0x20\n" ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "sub x11, x11, #0x10\n" ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" - "cmp x11, #0x20\n" ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x14, #0x40]\n" + "ldr q6, [x9, #0x40]\n" ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x9, #0x50]\n" ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x9, #0x60]\n" ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, 
v0.4b[1]\n" ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x9, #0x70]\n" ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x9, #0xa0]\n" ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x9, #0xb0]\n" ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x9, #0xc0]\n" ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x6f81e8ef // udot v15.4s, v7.16b, 
v1.4b[2]\n" ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x9, #0xd0]\n" ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x9, #0xe0]\n" ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" + "ldr q7, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" - "add x14, x14, #0x100\n" ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n" + "ldr q6, [x9, #0x0]\n" ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + "ldr q0, [x25, #0x0]\n" ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + "ldr q1, [x24, #0x0]\n" ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + "ldr q2, [x23, #0x0]\n" ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" + "ldr q3, [x22, #0x0]\n" ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n" - "bge 158b\n" - "159:" // Height 5: Multiply loop: Single iteration only - "sub x11, x11, #0x10\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q4, [x22, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "ldr q4, [x21, #0x0]\n" + "bge 153b\n" + "154:" // Height 5: Multiply loop: Single iteration only ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, 
v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" + "sub x26, x26, #0x10\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "add x25, x25, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - "add x28, x28, #0x10\n" + "add x24, x24, #0x10\n" ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x23, x23, #0x10\n" ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "ldr q6, [x14, #0x20]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "add x22, x22, #0x10\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "add x24, x24, #0x10\n" - "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x9, #0x20]\n" + "add x21, x21, #0x10\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - "add x22, x22, #0x10\n" - ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x21, #0x80]\n" ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x14, #0x40]\n" + "ldr q6, [x9, #0x40]\n" ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x9, #0x50]\n" ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, 
v0.4b[1]\n" ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x9, #0x60]\n" ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x9, #0x70]\n" ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x9, #0xa0]\n" ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x9, #0xb0]\n" ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x6f81e8ce // udot v14.4s, v6.16b, 
v1.4b[2]\n" ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x9, #0xc0]\n" ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x9, #0xd0]\n" ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x9, #0xe0]\n" ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" + "ldr q7, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" - "add x14, x14, #0x100\n" ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" @@ -2279,34 +2226,34 @@ void a64_hybrid_u8u32_dot_6x16 ( ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n" - "160:" // Height 5: Multiply loop: Main loop skip - "cbz x11, 165f\n" - "cmp x11, #0x4\n" - "blt 162f\n" - "161:" // Height 5: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x28], #0x4\n" - "ldr s2, [x26], #0x4\n" - "ldr s3, [x24], #0x4\n" - "ldr s4, [x22], #0x4\n" - "ldr q6, [x14, #0x0]\n" + 
"155:" // Height 5: Multiply loop: Main loop skip + "cbz x26, 160f\n" + "cmp x26, #0x4\n" + "blt 157f\n" + "156:" // Height 5: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x4\n" + "ldr s1, [x24], #0x4\n" + "cmp x26, #0x4\n" + "ldr s2, [x23], #0x4\n" + "ldr s3, [x22], #0x4\n" + "ldr s4, [x21], #0x4\n" + "ldr q6, [x9, #0x0]\n" ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "sub x11, x11, #0x4\n" ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - "cmp x11, #0x4\n" ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x9, #0x20]\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" + "add x9, x9, #0x40\n" ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" @@ -2316,45 +2263,45 @@ void a64_hybrid_u8u32_dot_6x16 ( ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" - "bge 161b\n" - "cbz x11, 165f\n" - "162:" // Height 5: Multiply loop: Skip odd blocks - "tbz x11, #1, 163f\n" - "ldr h0, [x10], #0x2\n" - "ldr h1, [x28], #0x2\n" - "ldr h2, [x26], #0x2\n" - "ldr h3, [x24], #0x2\n" - "ldr h4, [x22], #0x2\n" - "tbz x11, #0, 164f\n" - "ld1 { v0.b }[2], [x10]\n" - "ld1 { v1.b }[2], [x28]\n" - "ld1 { v2.b }[2], [x26]\n" - "ld1 { v3.b }[2], [x24]\n" - "ld1 { v4.b }[2], [x22]\n" - "b 
164f\n" - "163:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x10, #0x0]\n" - "ldr b1, [x28, #0x0]\n" - "ldr b2, [x26, #0x0]\n" - "ldr b3, [x24, #0x0]\n" - "ldr b4, [x22, #0x0]\n" - "164:" // Height 5: Multiply loop: Ragged operand read: Done - "ldr q6, [x14, #0x0]\n" + "bge 156b\n" + "cbz x26, 160f\n" + "157:" // Height 5: Multiply loop: Skip odd blocks + "tbz x26, #1, 158f\n" + "ldr h0, [x25], #0x2\n" + "ldr h1, [x24], #0x2\n" + "ldr h2, [x23], #0x2\n" + "ldr h3, [x22], #0x2\n" + "ldr h4, [x21], #0x2\n" + "tbz x26, #0, 159f\n" + "ld1 { v0.b }[2], [x25]\n" + "ld1 { v1.b }[2], [x24]\n" + "ld1 { v2.b }[2], [x23]\n" + "ld1 { v3.b }[2], [x22]\n" + "ld1 { v4.b }[2], [x21]\n" + "b 159f\n" + "158:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x25, #0x0]\n" + "ldr b1, [x24, #0x0]\n" + "ldr b2, [x23, #0x0]\n" + "ldr b3, [x22, #0x0]\n" + "ldr b4, [x21, #0x0]\n" + "159:" // Height 5: Multiply loop: Ragged operand read: Done + "ldr q6, [x9, #0x0]\n" ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x9, #0x20]\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" + "add x9, x9, #0x40\n" ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" @@ -2364,356 +2311,338 @@ 
void a64_hybrid_u8u32_dot_6x16 ( ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" - "165:" // Height 5: Multiply loop: No odd multiplies + "160:" // Height 5: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 155b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "cmp x15, #0x10\n" - "prfm pstl1keep, [x27, #0x0]\n" - "prfm pstl1keep, [x25, #0x0]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 150b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "cmp x10, #0x10\n" + "add x23, x28, x19, LSL #2\n" "prfm pstl1keep, [x23, #0x0]\n" - "bge 174f\n" - "tbz x15, #3, 169f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v9.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v13.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "st1 { v17.4s }, [x27], #0x10\n" - "st1 { v20.4s }, [x25], #0x10\n" - "st1 { v21.4s }, [x25], #0x10\n" - "st1 { v24.4s }, [x23], #0x10\n" - "st1 { v25.4s }, [x23], #0x10\n" - "tbz x15, #2, 167f\n" - "st1 { v10.4s }, [x13], #0x10\n" - "st1 { v14.4s }, [x9], #0x10\n" - "st1 { v18.4s }, [x27], #0x10\n" - "st1 { v22.4s }, [x25], #0x10\n" - "st1 { v26.4s }, [x23], #0x10\n" - "tbz x15, #1, 166f\n" - "str d11, [x13], #0x8\n" - "str d15, [x9], #0x8\n" - "str d19, [x27], #0x8\n" - "str d23, [x25], #0x8\n" - "str d27, [x23], #0x8\n" - "tbz x15, #0, 173f\n" - "st1 { v11.s }[2], [x13]\n" - "st1 { v15.s }[2], [x9]\n" - "st1 { v19.s }[2], [x27]\n" - "st1 { v23.s }[2], [x25]\n" - "st1 { v27.s }[2], [x23]\n" - "b 173f\n" - "166:" // Height 5: Partial direct writeback: partial_1_12 - "tbz x15, #0, 173f\n" - "str s11, [x13, #0x0]\n" - "str s15, [x9, #0x0]\n" - "str s19, [x27, #0x0]\n" - "str s23, [x25, #0x0]\n" - "str s27, [x23, #0x0]\n" - "b 173f\n" - "167:" // Height 5: Partial 
direct writeback: partial_2_8 - "tbz x15, #1, 168f\n" - "str d10, [x13], #0x8\n" - "str d14, [x9], #0x8\n" - "str d18, [x27], #0x8\n" - "str d22, [x25], #0x8\n" - "str d26, [x23], #0x8\n" - "tbz x15, #0, 173f\n" - "st1 { v10.s }[2], [x13]\n" - "st1 { v14.s }[2], [x9]\n" - "st1 { v18.s }[2], [x27]\n" - "st1 { v22.s }[2], [x25]\n" - "st1 { v26.s }[2], [x23]\n" - "b 173f\n" - "168:" // Height 5: Partial direct writeback: partial_1_8 - "tbz x15, #0, 173f\n" - "str s10, [x13, #0x0]\n" - "str s14, [x9, #0x0]\n" - "str s18, [x27, #0x0]\n" - "str s22, [x25, #0x0]\n" - "str s26, [x23, #0x0]\n" - "b 173f\n" - "169:" // Height 5: Partial direct writeback: partial_4_0 - "tbz x15, #2, 171f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "st1 { v20.4s }, [x25], #0x10\n" - "st1 { v24.4s }, [x23], #0x10\n" - "tbz x15, #1, 170f\n" - "str d9, [x13], #0x8\n" - "str d13, [x9], #0x8\n" - "str d17, [x27], #0x8\n" - "str d21, [x25], #0x8\n" - "str d25, [x23], #0x8\n" - "tbz x15, #0, 173f\n" - "st1 { v9.s }[2], [x13]\n" - "st1 { v13.s }[2], [x9]\n" - "st1 { v17.s }[2], [x27]\n" - "st1 { v21.s }[2], [x25]\n" - "st1 { v25.s }[2], [x23]\n" - "b 173f\n" - "170:" // Height 5: Partial direct writeback: partial_1_4 - "tbz x15, #0, 173f\n" - "str s9, [x13, #0x0]\n" - "str s13, [x9, #0x0]\n" - "str s17, [x27, #0x0]\n" - "str s21, [x25, #0x0]\n" - "str s25, [x23, #0x0]\n" - "b 173f\n" - "171:" // Height 5: Partial direct writeback: partial_2_0 - "tbz x15, #1, 172f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "str d16, [x27], #0x8\n" - "str d20, [x25], #0x8\n" - "str d24, [x23], #0x8\n" - "tbz x15, #0, 173f\n" - "st1 { v8.s }[2], [x13]\n" - "st1 { v12.s }[2], [x9]\n" - "st1 { v16.s }[2], [x27]\n" - "st1 { v20.s }[2], [x25]\n" - "st1 { v24.s }[2], [x23]\n" - "b 173f\n" - "172:" // Height 5: Partial direct writeback: partial_1_0 - "str s8, [x13, #0x0]\n" - "str s12, [x9, #0x0]\n" - "str s16, [x27, #0x0]\n" - "str s20, [x25, 
#0x0]\n" - "str s24, [x23, #0x0]\n" - "173:" // Height 5: Partial direct writeback: Done - "b 175f\n" - "174:" // Height 5: Full writeback - "str q8, [x13, #0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "str q12, [x9, #0x0]\n" - "str q13, [x9, #0x10]\n" - "str q14, [x9, #0x20]\n" - "str q15, [x9, #0x30]\n" - "str q16, [x27, #0x0]\n" - "str q17, [x27, #0x10]\n" - "str q18, [x27, #0x20]\n" - "str q19, [x27, #0x30]\n" - "str q20, [x25, #0x0]\n" - "str q21, [x25, #0x10]\n" - "str q22, [x25, #0x20]\n" - "str q23, [x25, #0x30]\n" - "str q24, [x23, #0x0]\n" - "str q25, [x23, #0x10]\n" - "str q26, [x23, #0x20]\n" - "str q27, [x23, #0x30]\n" - "add x13, x13, #0x40\n" - "add x9, x9, #0x40\n" - "add x27, x27, #0x40\n" - "add x25, x25, #0x40\n" - "add x23, x23, #0x40\n" - "175:" // Height 5: Writeback done - "subs x15, x15, #0x10\n" - "bgt 143b\n" - "b 212f\n" - "176:" // Height 6 - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" + "prfm pstl1keep, [x21, #0x0]\n" + "add x20, x21, x19, LSL #2\n" + "prfm pstl1keep, [x20, #0x0]\n" + "bge 169f\n" + "tbz x10, #3, 164f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v9.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v13.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v17.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "st1 { v21.4s }, [x21], #0x10\n" + "st1 { v24.4s }, [x20], #0x10\n" + "st1 { v25.4s }, [x20], #0x10\n" + "tbz x10, #2, 162f\n" + "st1 { v10.4s }, [x28], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "st1 { v18.4s }, [x22], #0x10\n" + "st1 { v22.4s }, [x21], #0x10\n" + "st1 { v26.4s }, [x20], #0x10\n" + "tbz x10, #1, 161f\n" + "str d11, [x28], #0x8\n" + "str d15, [x23], #0x8\n" + "str d19, [x22], #0x8\n" + "str d23, [x21], #0x8\n" + "str d27, [x20], #0x8\n" + "tbz x10, #0, 168f\n" + "st1 { 
v11.s }[2], [x28]\n" + "st1 { v15.s }[2], [x23]\n" + "st1 { v19.s }[2], [x22]\n" + "st1 { v23.s }[2], [x21]\n" + "st1 { v27.s }[2], [x20]\n" + "b 168f\n" + "161:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x10, #0, 168f\n" + "str s11, [x28, #0x0]\n" + "str s15, [x23, #0x0]\n" + "str s19, [x22, #0x0]\n" + "str s23, [x21, #0x0]\n" + "str s27, [x20, #0x0]\n" + "b 168f\n" + "162:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x10, #1, 163f\n" + "str d10, [x28], #0x8\n" + "str d14, [x23], #0x8\n" + "str d18, [x22], #0x8\n" + "str d22, [x21], #0x8\n" + "str d26, [x20], #0x8\n" + "tbz x10, #0, 168f\n" + "st1 { v10.s }[2], [x28]\n" + "st1 { v14.s }[2], [x23]\n" + "st1 { v18.s }[2], [x22]\n" + "st1 { v22.s }[2], [x21]\n" + "st1 { v26.s }[2], [x20]\n" + "b 168f\n" + "163:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x10, #0, 168f\n" + "str s10, [x28, #0x0]\n" + "str s14, [x23, #0x0]\n" + "str s18, [x22, #0x0]\n" + "str s22, [x21, #0x0]\n" + "str s26, [x20, #0x0]\n" + "b 168f\n" + "164:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x10, #2, 166f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "st1 { v24.4s }, [x20], #0x10\n" + "tbz x10, #1, 165f\n" + "str d9, [x28], #0x8\n" + "str d13, [x23], #0x8\n" + "str d17, [x22], #0x8\n" + "str d21, [x21], #0x8\n" + "str d25, [x20], #0x8\n" + "tbz x10, #0, 168f\n" + "st1 { v9.s }[2], [x28]\n" + "st1 { v13.s }[2], [x23]\n" + "st1 { v17.s }[2], [x22]\n" + "st1 { v21.s }[2], [x21]\n" + "st1 { v25.s }[2], [x20]\n" + "b 168f\n" + "165:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x10, #0, 168f\n" + "str s9, [x28, #0x0]\n" + "str s13, [x23, #0x0]\n" + "str s17, [x22, #0x0]\n" + "str s21, [x21, #0x0]\n" + "str s25, [x20, #0x0]\n" + "b 168f\n" + "166:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x10, #1, 167f\n" + "str d8, [x28], #0x8\n" + "str d12, [x23], #0x8\n" + 
"str d16, [x22], #0x8\n" + "str d20, [x21], #0x8\n" + "str d24, [x20], #0x8\n" + "tbz x10, #0, 168f\n" + "st1 { v8.s }[2], [x28]\n" + "st1 { v12.s }[2], [x23]\n" + "st1 { v16.s }[2], [x22]\n" + "st1 { v20.s }[2], [x21]\n" + "st1 { v24.s }[2], [x20]\n" + "b 168f\n" + "167:" // Height 5: Partial direct writeback: partial_1_0 + "str s8, [x28, #0x0]\n" + "str s12, [x23, #0x0]\n" + "str s16, [x22, #0x0]\n" + "str s20, [x21, #0x0]\n" + "str s24, [x20, #0x0]\n" + "168:" // Height 5: Partial direct writeback: Done + "b 170f\n" + "169:" // Height 5: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q12, [x23, #0x0]\n" + "str q13, [x23, #0x10]\n" + "str q14, [x23, #0x20]\n" + "str q15, [x23, #0x30]\n" + "str q16, [x22, #0x0]\n" + "str q17, [x22, #0x10]\n" + "str q18, [x22, #0x20]\n" + "str q19, [x22, #0x30]\n" + "str q20, [x21, #0x0]\n" + "str q21, [x21, #0x10]\n" + "str q22, [x21, #0x20]\n" + "str q23, [x21, #0x30]\n" + "str q24, [x20, #0x0]\n" + "str q25, [x20, #0x10]\n" + "str q26, [x20, #0x20]\n" + "str q27, [x20, #0x30]\n" + "170:" // Height 5: Writeback done + "subs x10, x10, #0x10\n" + "bgt 138b\n" + "b 206f\n" + "171:" // Height 6 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x20, #0x18\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 177f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "ldr x23, [%x[output_ptr], #0x20]\n" - "add x27, x27, x19, LSL #2\n" - "ldr x21, [%x[output_ptr], #0x28]\n" - "add %x[output_ptr], %x[output_ptr], #0x30\n" - "add x25, x25, x19, LSL #2\n" - "add x23, x23, x19, LSL #2\n" - "add x21, x21, x19, LSL #2\n" - "b 178f\n" - "177:" // Height 6: setup direct 
output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "add x23, x25, x19, LSL #2\n" - "add x21, x23, x19, LSL #2\n" - "add %x[output_ptr], x21, x19, LSL #2\n" - "178:" // Height 6: Column loop - "tbz %x[flags], #0, 188f\n" - "cmp x15, #0x10\n" - "bge 187f\n" - "tbz x15, #3, 182f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "ld1 { v20.4s }, [x25], #0x10\n" - "ld1 { v24.4s }, [x23], #0x10\n" - "ld1 { v28.4s }, [x21], #0x10\n" - "ld1 { v9.4s }, [x13], #0x10\n" - "ld1 { v13.4s }, [x9], #0x10\n" - "ld1 { v17.4s }, [x27], #0x10\n" - "ld1 { v21.4s }, [x25], #0x10\n" - "ld1 { v25.4s }, [x23], #0x10\n" - "ld1 { v29.4s }, [x21], #0x10\n" - "tbz x15, #2, 180f\n" - "ld1 { v10.4s }, [x13], #0x10\n" - "ld1 { v14.4s }, [x9], #0x10\n" - "ld1 { v18.4s }, [x27], #0x10\n" - "ld1 { v22.4s }, [x25], #0x10\n" - "ld1 { v26.4s }, [x23], #0x10\n" - "ld1 { v30.4s }, [x21], #0x10\n" - "tbz x15, #1, 179f\n" - "mov x19, #0x38\n" - "ldr d11, [x13], #0x8\n" - "ldr d15, [x9], #0x8\n" - "ldr d19, [x27], #0x8\n" - "ldr d23, [x25], #0x8\n" - "ldr d27, [x23], #0x8\n" - "ldr d31, [x21], #0x8\n" - "tbz x15, #0, 186f\n" - "ld1 { v11.s }[2], [x13]\n" - "ld1 { v15.s }[2], [x9]\n" - "ld1 { v19.s }[2], [x27]\n" - "ld1 { v23.s }[2], [x25]\n" - "ld1 { v27.s }[2], [x23]\n" - "ld1 { v31.s }[2], [x21]\n" - "b 186f\n" - "179:" // Height 6: Partial accumulate: partial_1_12 - "mov x19, #0x30\n" - "tbz x15, #0, 186f\n" - "ldr s11, [x13, #0x0]\n" - "ldr s15, [x9, #0x0]\n" - "ldr s19, [x27, #0x0]\n" - "ldr s23, [x25, #0x0]\n" - "ldr s27, [x23, #0x0]\n" - "ldr s31, [x21, #0x0]\n" - "b 186f\n" - "180:" // Height 6: Partial accumulate: partial_2_8 - "tbz x15, #1, 181f\n" - "ldr d10, [x13], #0x8\n" - "ldr d14, [x9], #0x8\n" - "ldr d18, [x27], #0x8\n" - "ldr d22, [x25], #0x8\n" - "ldr d26, [x23], #0x8\n" - "ldr d30, [x21], #0x8\n" - "mov x19, #0x28\n" - "tbz x15, #0, 186f\n" - "ld1 { 
v10.s }[2], [x13]\n" - "ld1 { v14.s }[2], [x9]\n" - "ld1 { v18.s }[2], [x27]\n" - "ld1 { v22.s }[2], [x25]\n" - "ld1 { v26.s }[2], [x23]\n" - "ld1 { v30.s }[2], [x21]\n" - "b 186f\n" - "181:" // Height 6: Partial accumulate: partial_1_8 - "mov x19, #0x20\n" - "tbz x15, #0, 186f\n" - "ldr s10, [x13, #0x0]\n" - "ldr s14, [x9, #0x0]\n" - "ldr s18, [x27, #0x0]\n" - "ldr s22, [x25, #0x0]\n" - "ldr s26, [x23, #0x0]\n" - "ldr s30, [x21, #0x0]\n" - "b 186f\n" - "182:" // Height 6: Partial accumulate: partial_4_0 - "tbz x15, #2, 184f\n" - "ld1 { v8.4s }, [x13], #0x10\n" - "ld1 { v12.4s }, [x9], #0x10\n" - "ld1 { v16.4s }, [x27], #0x10\n" - "ld1 { v20.4s }, [x25], #0x10\n" - "ld1 { v24.4s }, [x23], #0x10\n" - "ld1 { v28.4s }, [x21], #0x10\n" - "tbz x15, #1, 183f\n" - "mov x19, #0x18\n" - "ldr d9, [x13], #0x8\n" - "ldr d13, [x9], #0x8\n" - "ldr d17, [x27], #0x8\n" - "ldr d21, [x25], #0x8\n" - "ldr d25, [x23], #0x8\n" - "ldr d29, [x21], #0x8\n" - "tbz x15, #0, 186f\n" - "ld1 { v9.s }[2], [x13]\n" - "ld1 { v13.s }[2], [x9]\n" - "ld1 { v17.s }[2], [x27]\n" - "ld1 { v21.s }[2], [x25]\n" - "ld1 { v25.s }[2], [x23]\n" - "ld1 { v29.s }[2], [x21]\n" - "b 186f\n" - "183:" // Height 6: Partial accumulate: partial_1_4 - "mov x19, #0x10\n" - "tbz x15, #0, 186f\n" - "ldr s9, [x13, #0x0]\n" - "ldr s13, [x9, #0x0]\n" - "ldr s17, [x27, #0x0]\n" - "ldr s21, [x25, #0x0]\n" - "ldr s25, [x23, #0x0]\n" - "ldr s29, [x21, #0x0]\n" - "b 186f\n" - "184:" // Height 6: Partial accumulate: partial_2_0 - "tbz x15, #1, 185f\n" - "ldr d8, [x13], #0x8\n" - "ldr d12, [x9], #0x8\n" - "ldr d16, [x27], #0x8\n" - "ldr d20, [x25], #0x8\n" - "ldr d24, [x23], #0x8\n" - "ldr d28, [x21], #0x8\n" - "mov x19, #0x8\n" - "tbz x15, #0, 186f\n" - "ld1 { v8.s }[2], [x13]\n" - "ld1 { v12.s }[2], [x9]\n" - "ld1 { v16.s }[2], [x27]\n" - "ld1 { v20.s }[2], [x25]\n" - "ld1 { v24.s }[2], [x23]\n" - "ld1 { v28.s }[2], [x21]\n" - "b 186f\n" - "185:" // Height 6: Partial accumulate: partial_1_0 - "mov x19, #0x0\n" - "ldr s8, [x13, 
#0x0]\n" - "ldr s12, [x9, #0x0]\n" - "ldr s16, [x27, #0x0]\n" - "ldr s20, [x25, #0x0]\n" - "ldr s24, [x23, #0x0]\n" - "ldr s28, [x21, #0x0]\n" - "186:" // Height 6: Partial accumulate: Done - "sub x13, x13, x19\n" - "sub x9, x9, x19\n" - "sub x27, x27, x19\n" - "sub x25, x25, x19\n" - "sub x23, x23, x19\n" - "sub x21, x21, x19\n" - "b 189f\n" - "187:" // Height 6: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "ldr q12, [x9, #0x0]\n" - "ldr q13, [x9, #0x10]\n" - "ldr q14, [x9, #0x20]\n" - "ldr q15, [x9, #0x30]\n" - "ldr q16, [x27, #0x0]\n" - "ldr q17, [x27, #0x10]\n" - "ldr q18, [x27, #0x20]\n" - "ldr q19, [x27, #0x30]\n" - "ldr q20, [x25, #0x0]\n" - "ldr q21, [x25, #0x10]\n" - "ldr q22, [x25, #0x20]\n" - "ldr q23, [x25, #0x30]\n" - "ldr q24, [x23, #0x0]\n" - "ldr q25, [x23, #0x10]\n" - "ldr q26, [x23, #0x20]\n" - "ldr q27, [x23, #0x30]\n" - "ldr q28, [x21, #0x0]\n" - "ldr q29, [x21, #0x10]\n" - "ldr q30, [x21, #0x20]\n" - "ldr q31, [x21, #0x30]\n" - "b 189f\n" - "188:" // Height 6: no accumulate + "madd %x[output_ptr], x19, x20, %x[output_ptr]\n" + "172:" // Height 6: Column loop + "tbz %x[flags], #0, 182f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x10, #0x10\n" + "add x23, x28, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "add x19, x20, x19, LSL #2\n" + "bge 181f\n" + "tbz x10, #3, 176f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v24.4s }, [x20], #0x10\n" + "ld1 { v28.4s }, [x19], #0x10\n" + "ld1 { v9.4s }, [x28], #0x10\n" + "ld1 { v13.4s }, [x23], #0x10\n" + "ld1 { v17.4s }, [x22], #0x10\n" + "ld1 { v21.4s }, [x21], #0x10\n" + "ld1 { v25.4s }, [x20], #0x10\n" + "ld1 { v29.4s }, [x19], #0x10\n" + "tbz x10, #2, 174f\n" + "ld1 { v10.4s }, [x28], #0x10\n" + "ld1 { v14.4s }, [x23], #0x10\n" + 
"ld1 { v18.4s }, [x22], #0x10\n" + "ld1 { v22.4s }, [x21], #0x10\n" + "ld1 { v26.4s }, [x20], #0x10\n" + "ld1 { v30.4s }, [x19], #0x10\n" + "tbz x10, #1, 173f\n" + "mov x24, #0x38\n" + "ldr d11, [x28], #0x8\n" + "ldr d15, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" + "ldr d27, [x20], #0x8\n" + "ldr d31, [x19], #0x8\n" + "tbz x10, #0, 180f\n" + "ld1 { v11.s }[2], [x28]\n" + "ld1 { v15.s }[2], [x23]\n" + "ld1 { v19.s }[2], [x22]\n" + "ld1 { v23.s }[2], [x21]\n" + "ld1 { v27.s }[2], [x20]\n" + "ld1 { v31.s }[2], [x19]\n" + "b 180f\n" + "173:" // Height 6: Partial accumulate: partial_1_12 + "mov x24, #0x30\n" + "tbz x10, #0, 180f\n" + "ldr s11, [x28, #0x0]\n" + "ldr s15, [x23, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "ldr s23, [x21, #0x0]\n" + "ldr s27, [x20, #0x0]\n" + "ldr s31, [x19, #0x0]\n" + "b 180f\n" + "174:" // Height 6: Partial accumulate: partial_2_8 + "tbz x10, #1, 175f\n" + "ldr d10, [x28], #0x8\n" + "ldr d14, [x23], #0x8\n" + "mov x24, #0x28\n" + "ldr d18, [x22], #0x8\n" + "ldr d22, [x21], #0x8\n" + "ldr d26, [x20], #0x8\n" + "ldr d30, [x19], #0x8\n" + "tbz x10, #0, 180f\n" + "ld1 { v10.s }[2], [x28]\n" + "ld1 { v14.s }[2], [x23]\n" + "ld1 { v18.s }[2], [x22]\n" + "ld1 { v22.s }[2], [x21]\n" + "ld1 { v26.s }[2], [x20]\n" + "ld1 { v30.s }[2], [x19]\n" + "b 180f\n" + "175:" // Height 6: Partial accumulate: partial_1_8 + "mov x24, #0x20\n" + "tbz x10, #0, 180f\n" + "ldr s10, [x28, #0x0]\n" + "ldr s14, [x23, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "ldr s22, [x21, #0x0]\n" + "ldr s26, [x20, #0x0]\n" + "ldr s30, [x19, #0x0]\n" + "b 180f\n" + "176:" // Height 6: Partial accumulate: partial_4_0 + "tbz x10, #2, 178f\n" + "ld1 { v8.4s }, [x28], #0x10\n" + "ld1 { v12.4s }, [x23], #0x10\n" + "ld1 { v16.4s }, [x22], #0x10\n" + "ld1 { v20.4s }, [x21], #0x10\n" + "ld1 { v24.4s }, [x20], #0x10\n" + "ld1 { v28.4s }, [x19], #0x10\n" + "tbz x10, #1, 177f\n" + "mov x24, #0x18\n" + "ldr d9, [x28], #0x8\n" + "ldr d13, [x23], #0x8\n" + "ldr d17, [x22], 
#0x8\n" + "ldr d21, [x21], #0x8\n" + "ldr d25, [x20], #0x8\n" + "ldr d29, [x19], #0x8\n" + "tbz x10, #0, 180f\n" + "ld1 { v9.s }[2], [x28]\n" + "ld1 { v13.s }[2], [x23]\n" + "ld1 { v17.s }[2], [x22]\n" + "ld1 { v21.s }[2], [x21]\n" + "ld1 { v25.s }[2], [x20]\n" + "ld1 { v29.s }[2], [x19]\n" + "b 180f\n" + "177:" // Height 6: Partial accumulate: partial_1_4 + "mov x24, #0x10\n" + "tbz x10, #0, 180f\n" + "ldr s9, [x28, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "ldr s17, [x22, #0x0]\n" + "ldr s21, [x21, #0x0]\n" + "ldr s25, [x20, #0x0]\n" + "ldr s29, [x19, #0x0]\n" + "b 180f\n" + "178:" // Height 6: Partial accumulate: partial_2_0 + "tbz x10, #1, 179f\n" + "ldr d8, [x28], #0x8\n" + "ldr d12, [x23], #0x8\n" + "mov x24, #0x8\n" + "ldr d16, [x22], #0x8\n" + "ldr d20, [x21], #0x8\n" + "ldr d24, [x20], #0x8\n" + "ldr d28, [x19], #0x8\n" + "tbz x10, #0, 180f\n" + "ld1 { v8.s }[2], [x28]\n" + "ld1 { v12.s }[2], [x23]\n" + "ld1 { v16.s }[2], [x22]\n" + "ld1 { v20.s }[2], [x21]\n" + "ld1 { v24.s }[2], [x20]\n" + "ld1 { v28.s }[2], [x19]\n" + "b 180f\n" + "179:" // Height 6: Partial accumulate: partial_1_0 + "ldr s8, [x28, #0x0]\n" + "mov x24, #0x0\n" + "ldr s12, [x23, #0x0]\n" + "ldr s16, [x22, #0x0]\n" + "ldr s20, [x21, #0x0]\n" + "ldr s24, [x20, #0x0]\n" + "ldr s28, [x19, #0x0]\n" + "180:" // Height 6: Partial accumulate: Done + "sub x28, x28, x24\n" + "b 183f\n" + "181:" // Height 6: full accumulate + "ldr q8, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q10, [x28, #0x20]\n" + "ldr q11, [x28, #0x30]\n" + "ldr q12, [x23, #0x0]\n" + "ldr q13, [x23, #0x10]\n" + "ldr q14, [x23, #0x20]\n" + "ldr q15, [x23, #0x30]\n" + "ldr q16, [x22, #0x0]\n" + "ldr q17, [x22, #0x10]\n" + "ldr q18, [x22, #0x20]\n" + "ldr q19, [x22, #0x30]\n" + "ldr q20, [x21, #0x0]\n" + "ldr q21, [x21, #0x10]\n" + "ldr q22, [x21, #0x20]\n" + "ldr q23, [x21, #0x30]\n" + "ldr q24, [x20, #0x0]\n" + "ldr q25, [x20, #0x10]\n" + "ldr q26, [x20, #0x20]\n" + "ldr q27, [x20, #0x30]\n" + "ldr q28, [x19, #0x0]\n" + "ldr 
q29, [x19, #0x10]\n" + "ldr q30, [x19, #0x20]\n" + "ldr q31, [x19, #0x30]\n" + "b 183f\n" + "182:" // Height 6: no accumulate "movi v8.4s, #0x0\n" "movi v9.4s, #0x0\n" "movi v10.4s, #0x0\n" @@ -2738,299 +2667,299 @@ void a64_hybrid_u8u32_dot_6x16 ( "movi v29.4s, #0x0\n" "movi v30.4s, #0x0\n" "movi v31.4s, #0x0\n" - "189:" // Height 6: setup done - "mov x12, #0x0\n" - "190:" // Height 6: String loop + "183:" // Height 6: setup done + "mov x27, #0x0\n" + "184:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 191f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 185f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "ldr x22, [x20, #0x20]\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" "ldr x20, [x20, #0x28]\n" - "cbnz x12, 192f\n" + "cbnz x27, 186f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "add x26, x26, x19\n" + "add x25, x25, x19\n" "add x24, x24, x19\n" + "add x23, x23, x19\n" "add x22, x22, x19\n" + "add x21, x21, x19\n" "add x20, x20, x19\n" - "b 192f\n" - "191:" // Height 6: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "add x26, x28, x19\n" - "add x24, x26, x19\n" - "add x22, x24, x19\n" - "add x20, x22, x19\n" - "192:" // Height 6: input setup done - "cmp x11, #0x10\n" - "blt 195f\n" - "cmp x11, #0x20\n" - "blt 194f\n" - "193:" // Height 6: Multiply loop: Main loop head - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q4, [x22, #0x0]\n" + "b 186f\n" + "185:" // Height 6: 
setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "add x20, x21, x19\n" + "186:" // Height 6: input setup done + "cmp x26, #0x10\n" + "blt 189f\n" + "ldr q0, [x25, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "cmp x26, #0x20\n" + "ldr q2, [x23, #0x0]\n" + "ldr q3, [x22, #0x0]\n" + "ldr q4, [x21, #0x0]\n" "ldr q5, [x20, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "ldr q6, [x9, #0x0]\n" + "blt 188f\n" + "187:" // Height 6: Multiply loop: Main loop head ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" + "add x25, x25, #0x10\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x23, x23, #0x10\n" ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x23, #0x80]\n" + "add x22, x22, #0x10\n" ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "add x24, x24, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "add x21, x21, #0x10\n" ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x9, #0x20]\n" + "add x20, x20, #0x10\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "add x22, x22, #0x10\n" - "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "sub x26, x26, #0x10\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - "add x20, x20, #0x10\n" - ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" "prfm pldl1keep, [x20, #0x80]\n" - "sub x11, x11, #0x10\n" + "cmp x26, #0x20\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, 
v3.4b[0]\n" - "cmp x11, #0x20\n" ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n" - "ldr q6, [x14, #0x40]\n" + "ldr q6, [x9, #0x40]\n" ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x9, #0x50]\n" ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n" ".inst 0x6fa5e0dc // udot v28.4s, v6.16b, v5.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x9, #0x60]\n" ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n" ".inst 0x6fa5e0fd // udot v29.4s, v7.16b, v5.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x9, #0x70]\n" ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n" ".inst 0x6fa5e0de // udot 
v30.4s, v6.16b, v5.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n" ".inst 0x6fa5e0ff // udot v31.4s, v7.16b, v5.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n" ".inst 0x6f85e8dc // udot v28.4s, v6.16b, v5.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x9, #0xa0]\n" ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n" ".inst 0x6f85e8fd // udot v29.4s, v7.16b, v5.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x9, #0xb0]\n" ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n" ".inst 0x6f85e8de // udot v30.4s, v6.16b, v5.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x9, #0xc0]\n" ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n" ".inst 0x6f85e8ff // udot v31.4s, v7.16b, v5.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x9, #0xd0]\n" ".inst 0x6fa0e8c8 // udot v8.4s, 
v6.16b, v0.4b[3]\n" ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n" ".inst 0x6fa5e8dc // udot v28.4s, v6.16b, v5.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x9, #0xe0]\n" ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n" ".inst 0x6fa5e8fd // udot v29.4s, v7.16b, v5.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" + "ldr q7, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" - "add x14, x14, #0x100\n" ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n" ".inst 0x6fa5e8de // udot v30.4s, v6.16b, v5.4b[3]\n" + "ldr q6, [x9, #0x0]\n" ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + "ldr q0, [x25, #0x0]\n" ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + "ldr q1, [x24, #0x0]\n" ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + "ldr q2, [x23, #0x0]\n" ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" + "ldr q3, [x22, #0x0]\n" ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n" + "ldr q4, [x21, #0x0]\n" ".inst 0x6fa5e8ff // udot v31.4s, v7.16b, v5.4b[3]\n" - "bge 193b\n" - "194:" // Height 6: Multiply loop: Single iteration only - "sub x11, x11, #0x10\n" - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x28, #0x0]\n" - "ldr q2, [x26, #0x0]\n" - "ldr q3, [x24, #0x0]\n" - "ldr q4, [x22, #0x0]\n" "ldr q5, [x20, #0x0]\n" - "ldr q6, [x14, #0x0]\n" + "bge 187b\n" + "188:" // Height 6: Multiply loop: Single iteration only ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, 
#0x10]\n" + "ldr q7, [x9, #0x10]\n" + "sub x26, x26, #0x10\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "add x10, x10, #0x10\n" - "prfm pldl1keep, [x10, #0x80]\n" + "add x25, x25, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - "add x28, x28, #0x10\n" - ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "add x26, x26, #0x10\n" - ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" "add x24, x24, #0x10\n" - ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" "prfm pldl1keep, [x24, #0x80]\n" - "ldr q6, [x14, #0x20]\n" - ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "add x23, x23, #0x10\n" + ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x23, #0x80]\n" "add x22, x22, #0x10\n" + ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n" "prfm pldl1keep, [x22, #0x80]\n" - ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "add x21, x21, #0x10\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q6, [x9, #0x20]\n" "add x20, x20, #0x10\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "prfm pldl1keep, [x21, #0x80]\n" ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" "prfm pldl1keep, [x20, #0x80]\n" ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n" - "ldr q6, [x14, #0x40]\n" + "ldr q6, [x9, #0x40]\n" ".inst 0x6f80e0eb // udot v11.4s, 
v7.16b, v0.4b[0]\n" ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x14, #0x50]\n" + "ldr q7, [x9, #0x50]\n" ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n" ".inst 0x6fa5e0dc // udot v28.4s, v6.16b, v5.4b[1]\n" - "ldr q6, [x14, #0x60]\n" + "ldr q6, [x9, #0x60]\n" ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n" ".inst 0x6fa5e0fd // udot v29.4s, v7.16b, v5.4b[1]\n" - "ldr q7, [x14, #0x70]\n" + "ldr q7, [x9, #0x70]\n" ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n" ".inst 0x6fa5e0de // udot v30.4s, v6.16b, v5.4b[1]\n" - "ldr q6, [x14, #0x80]\n" + "ldr q6, [x9, #0x80]\n" ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n" ".inst 0x6fa5e0ff // udot v31.4s, v7.16b, v5.4b[1]\n" - "ldr q7, [x14, #0x90]\n" + "ldr q7, [x9, #0x90]\n" ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, 
v2.4b[2]\n" ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n" ".inst 0x6f85e8dc // udot v28.4s, v6.16b, v5.4b[2]\n" - "ldr q6, [x14, #0xa0]\n" + "ldr q6, [x9, #0xa0]\n" ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n" ".inst 0x6f85e8fd // udot v29.4s, v7.16b, v5.4b[2]\n" - "ldr q7, [x14, #0xb0]\n" + "ldr q7, [x9, #0xb0]\n" ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n" ".inst 0x6f85e8de // udot v30.4s, v6.16b, v5.4b[2]\n" - "ldr q6, [x14, #0xc0]\n" + "ldr q6, [x9, #0xc0]\n" ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n" ".inst 0x6f85e8ff // udot v31.4s, v7.16b, v5.4b[2]\n" - "ldr q7, [x14, #0xd0]\n" + "ldr q7, [x9, #0xd0]\n" ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n" ".inst 0x6fa5e8dc // udot v28.4s, v6.16b, v5.4b[3]\n" - "ldr q6, [x14, #0xe0]\n" + "ldr q6, [x9, #0xe0]\n" ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, 
v4.4b[3]\n" ".inst 0x6fa5e8fd // udot v29.4s, v7.16b, v5.4b[3]\n" - "ldr q7, [x14, #0xf0]\n" + "ldr q7, [x9, #0xf0]\n" + "add x9, x9, #0x100\n" ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" - "add x14, x14, #0x100\n" ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" @@ -3042,37 +2971,37 @@ void a64_hybrid_u8u32_dot_6x16 ( ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n" ".inst 0x6fa5e8ff // udot v31.4s, v7.16b, v5.4b[3]\n" - "195:" // Height 6: Multiply loop: Main loop skip - "cbz x11, 200f\n" - "cmp x11, #0x4\n" - "blt 197f\n" - "196:" // Height 6: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x28], #0x4\n" - "ldr s2, [x26], #0x4\n" - "ldr s3, [x24], #0x4\n" - "ldr s4, [x22], #0x4\n" + "189:" // Height 6: Multiply loop: Main loop skip + "cbz x26, 194f\n" + "cmp x26, #0x4\n" + "blt 191f\n" + "190:" // Height 6: Multiply loop: Odd block loop + "ldr s0, [x25], #0x4\n" + "sub x26, x26, #0x4\n" + "ldr s1, [x24], #0x4\n" + "cmp x26, #0x4\n" + "ldr s2, [x23], #0x4\n" + "ldr s3, [x22], #0x4\n" + "ldr s4, [x21], #0x4\n" "ldr s5, [x20], #0x4\n" - "ldr q6, [x14, #0x0]\n" + "ldr q6, [x9, #0x0]\n" ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "sub x11, x11, #0x4\n" ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - "cmp x11, #0x4\n" ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x9, #0x20]\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" ".inst 
0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" + "add x9, x9, #0x40\n" ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" @@ -3084,50 +3013,50 @@ void a64_hybrid_u8u32_dot_6x16 ( ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n" - "bge 196b\n" - "cbz x11, 200f\n" - "197:" // Height 6: Multiply loop: Skip odd blocks - "tbz x11, #1, 198f\n" - "ldr h0, [x10], #0x2\n" - "ldr h1, [x28], #0x2\n" - "ldr h2, [x26], #0x2\n" - "ldr h3, [x24], #0x2\n" - "ldr h4, [x22], #0x2\n" + "bge 190b\n" + "cbz x26, 194f\n" + "191:" // Height 6: Multiply loop: Skip odd blocks + "tbz x26, #1, 192f\n" + "ldr h0, [x25], #0x2\n" + "ldr h1, [x24], #0x2\n" + "ldr h2, [x23], #0x2\n" + "ldr h3, [x22], #0x2\n" + "ldr h4, [x21], #0x2\n" "ldr h5, [x20], #0x2\n" - "tbz x11, #0, 199f\n" - "ld1 { v0.b }[2], [x10]\n" - "ld1 { v1.b }[2], [x28]\n" - "ld1 { v2.b }[2], [x26]\n" - "ld1 { v3.b }[2], [x24]\n" - "ld1 { v4.b }[2], [x22]\n" + "tbz x26, #0, 193f\n" + "ld1 { v0.b }[2], [x25]\n" + "ld1 { v1.b }[2], [x24]\n" + "ld1 { v2.b }[2], [x23]\n" + "ld1 { v3.b }[2], [x22]\n" + "ld1 { v4.b }[2], [x21]\n" "ld1 { v5.b }[2], [x20]\n" - "b 199f\n" - "198:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x10, #0x0]\n" - "ldr b1, [x28, #0x0]\n" - "ldr b2, [x26, #0x0]\n" - "ldr b3, [x24, #0x0]\n" - "ldr b4, [x22, #0x0]\n" + "b 193f\n" + "192:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x25, #0x0]\n" + "ldr b1, [x24, #0x0]\n" + "ldr b2, [x23, #0x0]\n" + "ldr b3, [x22, #0x0]\n" + "ldr b4, [x21, #0x0]\n" "ldr b5, [x20, #0x0]\n" - "199:" // Height 6: Multiply loop: 
Ragged operand read: Done - "ldr q6, [x14, #0x0]\n" + "193:" // Height 6: Multiply loop: Ragged operand read: Done + "ldr q6, [x9, #0x0]\n" ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x14, #0x10]\n" + "ldr q7, [x9, #0x10]\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n" - "ldr q6, [x14, #0x20]\n" + "ldr q6, [x9, #0x20]\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x14, #0x30]\n" + "ldr q7, [x9, #0x30]\n" + "add x9, x9, #0x40\n" ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - "add x14, x14, #0x40\n" ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" @@ -3139,195 +3068,196 @@ void a64_hybrid_u8u32_dot_6x16 ( ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n" - "200:" // Height 6: Multiply loop: No odd multiplies + "194:" // Height 6: Multiply loop: No odd multiplies "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "bne 190b\n" - "prfm pstl1keep, [x13, #0x0]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "cmp x15, #0x10\n" - "prfm pstl1keep, [x27, #0x0]\n" - "prfm pstl1keep, [x25, #0x0]\n" + "add x27, x27, #0x1\n" + "cmp x27, x19\n" + "bne 184b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x28, #0x0]\n" + "cmp x10, #0x10\n" + "add x23, x28, 
x19, LSL #2\n" "prfm pstl1keep, [x23, #0x0]\n" + "add x22, x23, x19, LSL #2\n" + "prfm pstl1keep, [x22, #0x0]\n" + "add x21, x22, x19, LSL #2\n" "prfm pstl1keep, [x21, #0x0]\n" - "bge 209f\n" - "tbz x15, #3, 204f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v9.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v13.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "st1 { v17.4s }, [x27], #0x10\n" - "st1 { v20.4s }, [x25], #0x10\n" - "st1 { v21.4s }, [x25], #0x10\n" - "st1 { v24.4s }, [x23], #0x10\n" - "st1 { v25.4s }, [x23], #0x10\n" - "st1 { v28.4s }, [x21], #0x10\n" - "st1 { v29.4s }, [x21], #0x10\n" - "tbz x15, #2, 202f\n" - "st1 { v10.4s }, [x13], #0x10\n" - "st1 { v14.4s }, [x9], #0x10\n" - "st1 { v18.4s }, [x27], #0x10\n" - "st1 { v22.4s }, [x25], #0x10\n" - "st1 { v26.4s }, [x23], #0x10\n" - "st1 { v30.4s }, [x21], #0x10\n" - "tbz x15, #1, 201f\n" - "str d11, [x13], #0x8\n" - "str d15, [x9], #0x8\n" - "str d19, [x27], #0x8\n" - "str d23, [x25], #0x8\n" - "str d27, [x23], #0x8\n" - "str d31, [x21], #0x8\n" - "tbz x15, #0, 208f\n" - "st1 { v11.s }[2], [x13]\n" - "st1 { v15.s }[2], [x9]\n" - "st1 { v19.s }[2], [x27]\n" - "st1 { v23.s }[2], [x25]\n" - "st1 { v27.s }[2], [x23]\n" - "st1 { v31.s }[2], [x21]\n" - "b 208f\n" - "201:" // Height 6: Partial direct writeback: partial_1_12 - "tbz x15, #0, 208f\n" - "str s11, [x13, #0x0]\n" - "str s15, [x9, #0x0]\n" - "str s19, [x27, #0x0]\n" - "str s23, [x25, #0x0]\n" - "str s27, [x23, #0x0]\n" - "str s31, [x21, #0x0]\n" - "b 208f\n" - "202:" // Height 6: Partial direct writeback: partial_2_8 - "tbz x15, #1, 203f\n" - "str d10, [x13], #0x8\n" - "str d14, [x9], #0x8\n" - "str d18, [x27], #0x8\n" - "str d22, [x25], #0x8\n" - "str d26, [x23], #0x8\n" - "str d30, [x21], #0x8\n" - "tbz x15, #0, 208f\n" - "st1 { v10.s }[2], [x13]\n" - "st1 { v14.s }[2], [x9]\n" - "st1 { v18.s }[2], [x27]\n" - "st1 { v22.s }[2], [x25]\n" - "st1 { v26.s }[2], [x23]\n" - "st1 { v30.s }[2], [x21]\n" - "b 208f\n" - "203:" // 
Height 6: Partial direct writeback: partial_1_8 - "tbz x15, #0, 208f\n" - "str s10, [x13, #0x0]\n" - "str s14, [x9, #0x0]\n" - "str s18, [x27, #0x0]\n" - "str s22, [x25, #0x0]\n" - "str s26, [x23, #0x0]\n" - "str s30, [x21, #0x0]\n" - "b 208f\n" - "204:" // Height 6: Partial direct writeback: partial_4_0 - "tbz x15, #2, 206f\n" - "st1 { v8.4s }, [x13], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v16.4s }, [x27], #0x10\n" - "st1 { v20.4s }, [x25], #0x10\n" - "st1 { v24.4s }, [x23], #0x10\n" - "st1 { v28.4s }, [x21], #0x10\n" - "tbz x15, #1, 205f\n" - "str d9, [x13], #0x8\n" - "str d13, [x9], #0x8\n" - "str d17, [x27], #0x8\n" - "str d21, [x25], #0x8\n" - "str d25, [x23], #0x8\n" - "str d29, [x21], #0x8\n" - "tbz x15, #0, 208f\n" - "st1 { v9.s }[2], [x13]\n" - "st1 { v13.s }[2], [x9]\n" - "st1 { v17.s }[2], [x27]\n" - "st1 { v21.s }[2], [x25]\n" - "st1 { v25.s }[2], [x23]\n" - "st1 { v29.s }[2], [x21]\n" - "b 208f\n" - "205:" // Height 6: Partial direct writeback: partial_1_4 - "tbz x15, #0, 208f\n" - "str s9, [x13, #0x0]\n" - "str s13, [x9, #0x0]\n" - "str s17, [x27, #0x0]\n" - "str s21, [x25, #0x0]\n" - "str s25, [x23, #0x0]\n" - "str s29, [x21, #0x0]\n" - "b 208f\n" - "206:" // Height 6: Partial direct writeback: partial_2_0 - "tbz x15, #1, 207f\n" - "str d8, [x13], #0x8\n" - "str d12, [x9], #0x8\n" - "str d16, [x27], #0x8\n" - "str d20, [x25], #0x8\n" - "str d24, [x23], #0x8\n" - "str d28, [x21], #0x8\n" - "tbz x15, #0, 208f\n" - "st1 { v8.s }[2], [x13]\n" - "st1 { v12.s }[2], [x9]\n" - "st1 { v16.s }[2], [x27]\n" - "st1 { v20.s }[2], [x25]\n" - "st1 { v24.s }[2], [x23]\n" - "st1 { v28.s }[2], [x21]\n" - "b 208f\n" - "207:" // Height 6: Partial direct writeback: partial_1_0 - "str s8, [x13, #0x0]\n" - "str s12, [x9, #0x0]\n" - "str s16, [x27, #0x0]\n" - "str s20, [x25, #0x0]\n" - "str s24, [x23, #0x0]\n" - "str s28, [x21, #0x0]\n" - "208:" // Height 6: Partial direct writeback: Done - "b 210f\n" - "209:" // Height 6: Full writeback - "str q8, [x13, 
#0x0]\n" - "str q9, [x13, #0x10]\n" - "str q10, [x13, #0x20]\n" - "str q11, [x13, #0x30]\n" - "str q12, [x9, #0x0]\n" - "str q13, [x9, #0x10]\n" - "str q14, [x9, #0x20]\n" - "str q15, [x9, #0x30]\n" - "str q16, [x27, #0x0]\n" - "str q17, [x27, #0x10]\n" - "str q18, [x27, #0x20]\n" - "str q19, [x27, #0x30]\n" - "str q20, [x25, #0x0]\n" - "str q21, [x25, #0x10]\n" - "str q22, [x25, #0x20]\n" - "str q23, [x25, #0x30]\n" - "str q24, [x23, #0x0]\n" - "str q25, [x23, #0x10]\n" - "str q26, [x23, #0x20]\n" - "str q27, [x23, #0x30]\n" - "str q28, [x21, #0x0]\n" - "str q29, [x21, #0x10]\n" - "str q30, [x21, #0x20]\n" - "str q31, [x21, #0x30]\n" - "add x13, x13, #0x40\n" - "add x9, x9, #0x40\n" - "add x27, x27, #0x40\n" - "add x25, x25, #0x40\n" - "add x23, x23, #0x40\n" - "add x21, x21, #0x40\n" - "210:" // Height 6: Writeback done - "subs x15, x15, #0x10\n" - "bgt 178b\n" + "add x20, x21, x19, LSL #2\n" + "prfm pstl1keep, [x20, #0x0]\n" + "add x19, x20, x19, LSL #2\n" + "prfm pstl1keep, [x19, #0x0]\n" + "bge 203f\n" + "tbz x10, #3, 198f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v9.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v13.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v17.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "st1 { v21.4s }, [x21], #0x10\n" + "st1 { v24.4s }, [x20], #0x10\n" + "st1 { v25.4s }, [x20], #0x10\n" + "st1 { v28.4s }, [x19], #0x10\n" + "st1 { v29.4s }, [x19], #0x10\n" + "tbz x10, #2, 196f\n" + "st1 { v10.4s }, [x28], #0x10\n" + "st1 { v14.4s }, [x23], #0x10\n" + "st1 { v18.4s }, [x22], #0x10\n" + "st1 { v22.4s }, [x21], #0x10\n" + "st1 { v26.4s }, [x20], #0x10\n" + "st1 { v30.4s }, [x19], #0x10\n" + "tbz x10, #1, 195f\n" + "str d11, [x28], #0x8\n" + "str d15, [x23], #0x8\n" + "str d19, [x22], #0x8\n" + "str d23, [x21], #0x8\n" + "str d27, [x20], #0x8\n" + "str d31, [x19], #0x8\n" + "tbz x10, #0, 202f\n" + "st1 { v11.s }[2], [x28]\n" + "st1 { v15.s }[2], [x23]\n" + "st1 { v19.s }[2], [x22]\n" 
+ "st1 { v23.s }[2], [x21]\n" + "st1 { v27.s }[2], [x20]\n" + "st1 { v31.s }[2], [x19]\n" + "b 202f\n" + "195:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x10, #0, 202f\n" + "str s11, [x28, #0x0]\n" + "str s15, [x23, #0x0]\n" + "str s19, [x22, #0x0]\n" + "str s23, [x21, #0x0]\n" + "str s27, [x20, #0x0]\n" + "str s31, [x19, #0x0]\n" + "b 202f\n" + "196:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x10, #1, 197f\n" + "str d10, [x28], #0x8\n" + "str d14, [x23], #0x8\n" + "str d18, [x22], #0x8\n" + "str d22, [x21], #0x8\n" + "str d26, [x20], #0x8\n" + "str d30, [x19], #0x8\n" + "tbz x10, #0, 202f\n" + "st1 { v10.s }[2], [x28]\n" + "st1 { v14.s }[2], [x23]\n" + "st1 { v18.s }[2], [x22]\n" + "st1 { v22.s }[2], [x21]\n" + "st1 { v26.s }[2], [x20]\n" + "st1 { v30.s }[2], [x19]\n" + "b 202f\n" + "197:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x10, #0, 202f\n" + "str s10, [x28, #0x0]\n" + "str s14, [x23, #0x0]\n" + "str s18, [x22, #0x0]\n" + "str s22, [x21, #0x0]\n" + "str s26, [x20, #0x0]\n" + "str s30, [x19, #0x0]\n" + "b 202f\n" + "198:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x10, #2, 200f\n" + "st1 { v8.4s }, [x28], #0x10\n" + "st1 { v12.4s }, [x23], #0x10\n" + "st1 { v16.4s }, [x22], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "st1 { v24.4s }, [x20], #0x10\n" + "st1 { v28.4s }, [x19], #0x10\n" + "tbz x10, #1, 199f\n" + "str d9, [x28], #0x8\n" + "str d13, [x23], #0x8\n" + "str d17, [x22], #0x8\n" + "str d21, [x21], #0x8\n" + "str d25, [x20], #0x8\n" + "str d29, [x19], #0x8\n" + "tbz x10, #0, 202f\n" + "st1 { v9.s }[2], [x28]\n" + "st1 { v13.s }[2], [x23]\n" + "st1 { v17.s }[2], [x22]\n" + "st1 { v21.s }[2], [x21]\n" + "st1 { v25.s }[2], [x20]\n" + "st1 { v29.s }[2], [x19]\n" + "b 202f\n" + "199:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x10, #0, 202f\n" + "str s9, [x28, #0x0]\n" + "str s13, [x23, #0x0]\n" + "str s17, [x22, #0x0]\n" + "str s21, [x21, #0x0]\n" + "str s25, [x20, #0x0]\n" + 
"str s29, [x19, #0x0]\n" + "b 202f\n" + "200:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x10, #1, 201f\n" + "str d8, [x28], #0x8\n" + "str d12, [x23], #0x8\n" + "str d16, [x22], #0x8\n" + "str d20, [x21], #0x8\n" + "str d24, [x20], #0x8\n" + "str d28, [x19], #0x8\n" + "tbz x10, #0, 202f\n" + "st1 { v8.s }[2], [x28]\n" + "st1 { v12.s }[2], [x23]\n" + "st1 { v16.s }[2], [x22]\n" + "st1 { v20.s }[2], [x21]\n" + "st1 { v24.s }[2], [x20]\n" + "st1 { v28.s }[2], [x19]\n" + "b 202f\n" + "201:" // Height 6: Partial direct writeback: partial_1_0 + "str s8, [x28, #0x0]\n" + "str s12, [x23, #0x0]\n" + "str s16, [x22, #0x0]\n" + "str s20, [x21, #0x0]\n" + "str s24, [x20, #0x0]\n" + "str s28, [x19, #0x0]\n" + "202:" // Height 6: Partial direct writeback: Done + "b 204f\n" + "203:" // Height 6: Full writeback + "str q8, [x28, #0x0]\n" + "str q9, [x28, #0x10]\n" + "str q10, [x28, #0x20]\n" + "str q11, [x28, #0x30]\n" + "add x28, x28, #0x40\n" + "str q12, [x23, #0x0]\n" + "str q13, [x23, #0x10]\n" + "str q14, [x23, #0x20]\n" + "str q15, [x23, #0x30]\n" + "str q16, [x22, #0x0]\n" + "str q17, [x22, #0x10]\n" + "str q18, [x22, #0x20]\n" + "str q19, [x22, #0x30]\n" + "str q20, [x21, #0x0]\n" + "str q21, [x21, #0x10]\n" + "str q22, [x21, #0x20]\n" + "str q23, [x21, #0x30]\n" + "str q24, [x20, #0x0]\n" + "str q25, [x20, #0x10]\n" + "str q26, [x20, #0x20]\n" + "str q27, [x20, #0x30]\n" + "str q28, [x19, #0x0]\n" + "str q29, [x19, #0x10]\n" + "str q30, [x19, #0x20]\n" + "str q31, [x19, #0x30]\n" + "204:" // Height 6: Writeback done + "subs x10, x10, #0x10\n" + "bgt 172b\n" "subs %x[M], %x[M], #0x6\n" - "beq 212f\n" + "beq 206f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 211f\n" + "tbz %x[flags], #3, 205f\n" "add x20, x20, #0x6\n" "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "211:" // Update direct input + "205:" // Update direct input "mov x19, #0x6\n" "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" "b 1b\n" - "212:" 
// Exit + "206:" // Exit : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp index 63fca129ba..066bff4602 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp @@ -37,7 +37,6 @@ namespace arm_gemm { - // Actual kernel implementations void sve_hybrid_bf16fp32_dot_6x4VL( ARGLIST ); @@ -74,7 +73,6 @@ public: // Default to the generic kernel kern_type kernel=sve_hybrid_bf16fp32_dot_6x4VL; - cls_sve_hybrid_bf16fp32_dot_6x4VL(const CPUInfo *) { } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp 
b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp index f5445e72e9..1233a98531 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp @@ -28,6 +28,7 @@ #include "../../bfloat.hpp" #include +#include namespace arm_gemm { @@ -95,164 +96,158 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "ptrue p5.b\n" "1:" // Row loop "cmp %x[M], #0x6\n" - "bge 71f\n" + "bge 66f\n" "cmp %x[M], #0x4\n" - "bgt 57f\n" - "beq 43f\n" + "bgt 53f\n" + "beq 40f\n" "cmp %x[M], #0x2\n" - "bgt 29f\n" - "beq 15f\n" - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "mov x14, %x[bias]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 2f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "b 3f\n" - "2:" // Height 1: setup direct output - "mov x13, %x[output_ptr]\n" - "3:" // Height 1: Column loop + "bgt 27f\n" + "beq 14f\n" + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x9, %x[bias]\n" + "mov x28, %x[output_ptr]\n" + "2:" // Height 1: Column loop "mov x19, #0x0\n" - "whilelt p4.s, x19, x16\n" + "whilelt p4.s, x19, x11\n" "incw x19\n" - "whilelt p3.s, x19, x16\n" + "whilelt p3.s, x19, x11\n" "incw x19\n" - "whilelt p2.s, x19, x16\n" + "whilelt p2.s, x19, x11\n" "incw x19\n" - "whilelt p1.s, x19, x16\n" - "cbz x14, 4f\n" - "ld1w { z8.s }, p5/Z, [x14]\n" - "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" - "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" - "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" - "b 6f\n" - "4:" // Height 1: no bias - "tbz %x[flags], #0, 5f\n" - "ld1w { z8.s }, p4/Z, [x13]\n" - "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" - "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" - "b 6f\n" - "5:" // Height 1: no accumulate + 
"whilelt p1.s, x19, x11\n" + "cbz x9, 3f\n" + "ld1w { z8.s }, p5/Z, [x9]\n" + "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n" + "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n" + "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "b 5f\n" + "3:" // Height 1: no bias + "tbz %x[flags], #0, 4f\n" + "ld1w { z8.s }, p4/Z, [x28]\n" + "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n" + "b 5f\n" + "4:" // Height 1: no accumulate "mov z8.b, #0x0\n" "mov z9.b, #0x0\n" "mov z10.b, #0x0\n" "mov z11.b, #0x0\n" - "6:" // Height 1: setup done - "mov x12, #0x0\n" - "7:" // Height 1: String loop + "5:" // Height 1: setup done + "mov x27, #0x0\n" + "6:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 8f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 7f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "cbnz x12, 9f\n" + "ldr x25, [x20, #0x0]\n" + "cbnz x27, 8f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #1\n" - "b 9f\n" - "8:" // Height 1: setup direct input - "mov x10, %x[input_ptr]\n" - "9:" // Height 1: input setup done - "cmp x11, #0x8\n" - "ble 11f\n" - "10:" // Height 1: Multiply loop: Main loop head - "ld1h { z6.h }, p5/Z, [x15]\n" - "whilelt p0.h, XZR, x11\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "sub x11, x11, #0x8\n" - "ld1rqh { z0.h }, p0/Z, [x10]\n" + "add x25, x25, x19, LSL #1\n" + "b 8f\n" + "7:" // Height 1: setup direct input + "mov x25, %x[input_ptr]\n" + "8:" // Height 1: input setup done + "cmp x26, #0x8\n" + "ble 10f\n" + "9:" // Height 1: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z7.h }, p5/Z, [x10, #1, 
MUL VL]\n" + "sub x26, x26, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" - "add x10, x10, #0x10\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "add x25, x25, #0x10\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "cmp x11, #0x8\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "cmp x26, #0x8\n" ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" - "prfm pldl1keep, [x10, #0x80]\n" + "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" + "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" - "addvl x15, x15, #16\n" + "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" ".inst 
0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" - "bgt 10b\n" - "11:" // Height 1: Multiply loop: Single iteration only - "ld1h { z6.h }, p5/Z, [x15]\n" - "whilelt p0.h, XZR, x11\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x2\n" - "ld1rqh { z0.h }, p0/Z, [x10]\n" + "bgt 9b\n" + "10:" // Height 1: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x2\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" - "add x10, x10, #0x10\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "add x25, x25, #0x10\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" - "ble 12f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 11f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x2\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x2\n" ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" - "addvl x15, x15, #4\n" + "addvl x10, x10, #4\n" ".inst 0x646840eb // bfdot z11.s, z7.h, 
z0.h[1]\n" - "ble 12f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 11f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x2\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x2\n" ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" - "addvl x15, x15, #4\n" + "addvl x10, x10, #4\n" ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" - "ble 12f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 11f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" - "12:" // Height 1: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" + "11:" // Height 1: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 7b\n" - "tbz %x[flags], #1, 13f\n" + "cmp x27, x19\n" + "bne 6b\n" + "tbz %x[flags], #1, 12f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1rw { z1.s }, p5/Z, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -265,63 +260,56 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "fmax z9.s, p5/M, z9.s, z1.s\n" "fmax z10.s, p5/M, z10.s, z1.s\n" "fmax z11.s, p5/M, z11.s, z1.s\n" - "13:" // Height 1: No activation - 
"st1w { z8.s }, p4, [x13]\n" - "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" - "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" - "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "14:" // Height 1: Writeback done - "decw x16, ALL, MUL #4\n" - "cmp x16, XZR\n" - "bgt 3b\n" - "b 86f\n" - "15:" // Height 2 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 16f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "add x9, x9, x19, LSL #2\n" - "b 17f\n" - "16:" // Height 2: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "17:" // Height 2: Column loop + "12:" // Height 1: No activation + "st1w { z8.s }, p4, [x28]\n" + "st1w { z9.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "13:" // Height 1: Writeback done + "decw x11, ALL, MUL #4\n" + "cmp x11, XZR\n" + "bgt 2b\n" + "b 80f\n" + "14:" // Height 2 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "15:" // Height 2: Column loop "mov x19, #0x0\n" - "whilelt p4.s, x19, x16\n" + "whilelt p4.s, x19, x11\n" "incw x19\n" - "whilelt p3.s, x19, x16\n" + "whilelt p3.s, x19, x11\n" "incw x19\n" - "whilelt p2.s, x19, x16\n" + "whilelt p2.s, x19, x11\n" "incw x19\n" - "whilelt p1.s, x19, x16\n" - "cbz x14, 18f\n" - "ld1w { z8.s }, p5/Z, [x14]\n" + "whilelt p1.s, x19, x11\n" + "cbz x9, 16f\n" + "ld1w { z8.s }, p5/Z, [x9]\n" "mov z12.d, z8.d\n" - "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" - "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n" + "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n" "mov z13.d, z9.d\n" - "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" - "addvl 
x14, x14, #4\n" + "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "mov z14.d, z10.d\n" "mov z15.d, z11.d\n" - "b 20f\n" - "18:" // Height 2: no bias - "tbz %x[flags], #0, 19f\n" - "ld1w { z8.s }, p4/Z, [x13]\n" - "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" - "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x9]\n" - "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" - "b 20f\n" - "19:" // Height 2: no accumulate + "b 18f\n" + "16:" // Height 2: no bias + "tbz %x[flags], #0, 17f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z8.s }, p4/Z, [x28]\n" + "add x24, x28, x19, LSL #2\n" + "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x24]\n" + "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" + "b 18f\n" + "17:" // Height 2: no accumulate "mov z8.b, #0x0\n" "mov z9.b, #0x0\n" "mov z10.b, #0x0\n" @@ -330,160 +318,162 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "mov z13.b, #0x0\n" "mov z14.b, #0x0\n" "mov z15.b, #0x0\n" - "20:" // Height 2: setup done - "mov x12, #0x0\n" - "21:" // Height 2: String loop + "18:" // Height 2: setup done + "mov x27, #0x0\n" + "19:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 22f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 20f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "cbnz x12, 23f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x27, 21f\n" "ldr x19, 
[%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #1\n" - "add x28, x28, x19, LSL #1\n" - "b 23f\n" - "22:" // Height 2: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #1\n" - "23:" // Height 2: input setup done - "cmp x11, #0x8\n" - "ble 25f\n" - "24:" // Height 2: Multiply loop: Main loop head - "ld1h { z6.h }, p5/Z, [x15]\n" - "whilelt p0.h, XZR, x11\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "sub x11, x11, #0x8\n" - "ld1rqh { z0.h }, p0/Z, [x10]\n" + "add x25, x25, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "b 21f\n" + "20:" // Height 2: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "21:" // Height 2: input setup done + "cmp x26, #0x8\n" + "ble 23f\n" + "22:" // Height 2: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "sub x26, x26, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" - "ld1rqh { z1.h }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqh { z1.h }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" - "cmp x11, #0x8\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "cmp x26, #0x8\n" ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "prfm pldl1keep, [x28, #0x80]\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "prfm pldl1keep, [x24, #0x80]\n" ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" ".inst 0x646140ef // bfdot z15.s, 
z7.h, z1.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" - "addvl x15, x15, #16\n" + "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" - 
"ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" - "bgt 24b\n" - "25:" // Height 2: Multiply loop: Single iteration only - "ld1h { z6.h }, p5/Z, [x15]\n" - "whilelt p0.h, XZR, x11\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x2\n" - "ld1rqh { z0.h }, p0/Z, [x10]\n" + "bgt 22b\n" + "23:" // Height 2: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x2\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" - "ld1rqh { z1.h }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqh { z1.h }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" - "add x28, x28, #0x10\n" + "add x24, x24, #0x10\n" ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" - "ble 26f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 24f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x2\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x2\n" ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, 
MUL VL]\n" ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" - "ble 26f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 24f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x2\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x2\n" ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" - "ble 26f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 24f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" 
".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" - "26:" // Height 2: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" + "24:" // Height 2: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" + "prfm pldl1keep, [x24, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 21b\n" - "tbz %x[flags], #1, 27f\n" + "cmp x27, x19\n" + "bne 19b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "tbz %x[flags], #1, 25f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1rw { z1.s }, p5/Z, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -504,79 +494,69 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "fmax z13.s, p5/M, z13.s, z1.s\n" "fmax z14.s, p5/M, z14.s, z1.s\n" "fmax z15.s, p5/M, z15.s, z1.s\n" - "27:" // Height 2: No activation - "st1w { z8.s }, p4, [x13]\n" - "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" - "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" - "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "st1w { z12.s }, p4, [x9]\n" - "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" - "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" - "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" - "addvl x9, x9, #4\n" - "28:" // Height 2: Writeback done - "decw x16, ALL, MUL #4\n" - "cmp x16, XZR\n" - "bgt 17b\n" - "b 86f\n" - "29:" // Height 3 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 30f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "add x27, x27, x19, LSL #2\n" - "b 31f\n" - "30:" // Height 3: setup direct output - "mov x13, 
%x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "31:" // Height 3: Column loop + "25:" // Height 2: No activation + "st1w { z8.s }, p4, [x28]\n" + "st1w { z9.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p4, [x24]\n" + "st1w { z13.s }, p3, [x24, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x24, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x24, #3, MUL VL]\n" + "26:" // Height 2: Writeback done + "decw x11, ALL, MUL #4\n" + "cmp x11, XZR\n" + "bgt 15b\n" + "b 80f\n" + "27:" // Height 3 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "28:" // Height 3: Column loop "mov x19, #0x0\n" - "whilelt p4.s, x19, x16\n" + "whilelt p4.s, x19, x11\n" "incw x19\n" - "whilelt p3.s, x19, x16\n" + "whilelt p3.s, x19, x11\n" "incw x19\n" - "whilelt p2.s, x19, x16\n" + "whilelt p2.s, x19, x11\n" "incw x19\n" - "whilelt p1.s, x19, x16\n" - "cbz x14, 32f\n" - "ld1w { z8.s }, p5/Z, [x14]\n" + "whilelt p1.s, x19, x11\n" + "cbz x9, 29f\n" + "ld1w { z8.s }, p5/Z, [x9]\n" "mov z12.d, z8.d\n" - "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n" "mov z16.d, z8.d\n" - "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" - "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" + "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n" + "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n" "mov z13.d, z9.d\n" - "addvl x14, x14, #4\n" + "addvl x9, x9, #4\n" "mov z17.d, z9.d\n" "mov z14.d, z10.d\n" "mov z15.d, z11.d\n" "mov z18.d, z10.d\n" "mov z19.d, z11.d\n" - "b 34f\n" - "32:" // Height 3: no bias - "tbz %x[flags], #0, 33f\n" - "ld1w { z8.s }, p4/Z, [x13]\n" - "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" - "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x9]\n" - "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" 
- "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x27]\n" - "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" - "b 34f\n" - "33:" // Height 3: no accumulate + "b 31f\n" + "29:" // Height 3: no bias + "tbz %x[flags], #0, 30f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z8.s }, p4/Z, [x28]\n" + "add x24, x28, x19, LSL #2\n" + "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n" + "add x23, x24, x19, LSL #2\n" + "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x24]\n" + "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x23]\n" + "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n" + "b 31f\n" + "30:" // Height 3: no accumulate "mov z8.b, #0x0\n" "mov z9.b, #0x0\n" "mov z10.b, #0x0\n" @@ -589,201 +569,204 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "mov z17.b, #0x0\n" "mov z18.b, #0x0\n" "mov z19.b, #0x0\n" - "34:" // Height 3: setup done - "mov x12, #0x0\n" - "35:" // Height 3: String loop + "31:" // Height 3: setup done + "mov x27, #0x0\n" + "32:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 36f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 33f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "cbnz x12, 37f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "cbnz x27, 34f\n" "ldr x19, 
[%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #1\n" - "add x28, x28, x19, LSL #1\n" - "add x26, x26, x19, LSL #1\n" - "b 37f\n" - "36:" // Height 3: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #1\n" - "add x26, x28, x19, LSL #1\n" - "37:" // Height 3: input setup done - "cmp x11, #0x8\n" - "ble 39f\n" - "38:" // Height 3: Multiply loop: Main loop head - "ld1h { z6.h }, p5/Z, [x15]\n" - "whilelt p0.h, XZR, x11\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "sub x11, x11, #0x8\n" - "ld1rqh { z0.h }, p0/Z, [x10]\n" + "add x25, x25, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" + "b 34f\n" + "33:" // Height 3: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "34:" // Height 3: input setup done + "cmp x26, #0x8\n" + "ble 36f\n" + "35:" // Height 3: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "sub x26, x26, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" - "ld1rqh { z1.h }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqh { z1.h }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" - "ld1rqh { z2.h }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqh { z2.h }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x23, x23, #0x10\n" ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" - "cmp x11, #0x8\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "cmp x26, #0x8\n" ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" - "ld1h { z7.h }, 
p5/Z, [x15, #3, MUL VL]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "prfm pldl1keep, [x23, #0x80]\n" ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" - "addvl x15, x15, #16\n" + "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" 
".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" - "bgt 38b\n" - "39:" // Height 3: Multiply loop: Single iteration only - "ld1h { z6.h }, p5/Z, [x15]\n" - "whilelt p0.h, XZR, x11\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x2\n" - "ld1rqh { z0.h }, p0/Z, [x10]\n" + "bgt 35b\n" + "36:" // Height 3: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x2\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" - "ld1rqh { z1.h }, p0/Z, [x28]\n" - "add 
x10, x10, #0x10\n" + "ld1rqh { z1.h }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" - "ld1rqh { z2.h }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqh { z2.h }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" - "add x26, x26, #0x10\n" + "add x23, x23, #0x10\n" ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" - "ble 40f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 37f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x2\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x2\n" ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" ".inst 0x646840eb // bfdot 
z11.s, z7.h, z0.h[1]\n" ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" - "ble 40f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 37f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x2\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x2\n" ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" - "ble 40f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 37f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" ".inst 0x647940ce // bfdot 
z14.s, z6.h, z1.h[3]\n" ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" - "40:" // Height 3: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "37:" // Height 3: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 35b\n" - "tbz %x[flags], #1, 41f\n" + "cmp x27, x19\n" + "bne 32b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "tbz %x[flags], #1, 38f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1rw { z1.s }, p5/Z, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -812,65 +795,48 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "fmax z17.s, p5/M, z17.s, z1.s\n" "fmax z18.s, p5/M, z18.s, z1.s\n" "fmax z19.s, p5/M, z19.s, z1.s\n" - "41:" // Height 3: No activation - "st1w { z8.s }, p4, [x13]\n" - "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" - "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" - "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "st1w { z12.s }, p4, [x9]\n" - "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" - "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" - "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" - "addvl x9, x9, #4\n" - "st1w { z16.s }, p4, [x27]\n" - "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" - "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" - "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" - "addvl x27, x27, #4\n" - "42:" // Height 3: Writeback done - "decw x16, ALL, MUL #4\n" - "cmp x16, XZR\n" - "bgt 31b\n" - "b 86f\n" - "43:" // Height 4 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - "ldr x15, [%x[args_ptr], 
%[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 44f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "add x27, x27, x19, LSL #2\n" - "add x25, x25, x19, LSL #2\n" - "b 45f\n" - "44:" // Height 4: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "45:" // Height 4: Column loop + "38:" // Height 3: No activation + "st1w { z8.s }, p4, [x28]\n" + "st1w { z9.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p4, [x24]\n" + "st1w { z13.s }, p3, [x24, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x24, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x24, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x23]\n" + "st1w { z17.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x23, #3, MUL VL]\n" + "39:" // Height 3: Writeback done + "decw x11, ALL, MUL #4\n" + "cmp x11, XZR\n" + "bgt 28b\n" + "b 80f\n" + "40:" // Height 4 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "41:" // Height 4: Column loop "mov x19, #0x0\n" - "whilelt p4.s, x19, x16\n" + "whilelt p4.s, x19, x11\n" "incw x19\n" - "whilelt p3.s, x19, x16\n" + "whilelt p3.s, x19, x11\n" "incw x19\n" - "whilelt p2.s, x19, x16\n" + "whilelt p2.s, x19, x11\n" "incw x19\n" - "whilelt p1.s, x19, x16\n" - "cbz x14, 46f\n" - "ld1w { z8.s }, p5/Z, [x14]\n" + "whilelt p1.s, x19, x11\n" + "cbz x9, 42f\n" + "ld1w { z8.s }, p5/Z, [x9]\n" "mov z12.d, z8.d\n" - "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n" "mov z16.d, z8.d\n" - "ld1w { z10.s }, 
p5/Z, [x14, #2, MUL VL]\n" + "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n" "mov z20.d, z8.d\n" - "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "mov z13.d, z9.d\n" "mov z17.d, z9.d\n" "mov z14.d, z10.d\n" @@ -880,27 +846,31 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "mov z21.d, z9.d\n" "mov z22.d, z10.d\n" "mov z23.d, z11.d\n" - "b 48f\n" - "46:" // Height 4: no bias - "tbz %x[flags], #0, 47f\n" - "ld1w { z8.s }, p4/Z, [x13]\n" - "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" - "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x9]\n" - "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x27]\n" - "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x25]\n" - "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" - "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" - "b 48f\n" - "47:" // Height 4: no accumulate + "b 44f\n" + "42:" // Height 4: no bias + "tbz %x[flags], #0, 43f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z8.s }, p4/Z, [x28]\n" + "add x24, x28, x19, LSL #2\n" + "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n" + "add x23, x24, x19, LSL #2\n" + "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z12.s }, p4/Z, [x24]\n" + "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x23]\n" + "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x22]\n" + "ld1w { z21.s 
}, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n" + "b 44f\n" + "43:" // Height 4: no accumulate "mov z8.b, #0x0\n" "mov z9.b, #0x0\n" "mov z10.b, #0x0\n" @@ -917,123 +887,123 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "mov z21.b, #0x0\n" "mov z22.b, #0x0\n" "mov z23.b, #0x0\n" - "48:" // Height 4: setup done - "mov x12, #0x0\n" - "49:" // Height 4: String loop + "44:" // Height 4: setup done + "mov x27, #0x0\n" + "45:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 50f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 46f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "cbnz x12, 51f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "cbnz x27, 47f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #1\n" - "add x28, x28, x19, LSL #1\n" - "add x26, x26, x19, LSL #1\n" + "add x25, x25, x19, LSL #1\n" "add x24, x24, x19, LSL #1\n" - "b 51f\n" - "50:" // Height 4: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #1\n" - "add x26, x28, x19, LSL #1\n" - "add x24, x26, x19, LSL #1\n" - "51:" // Height 4: input setup done - "cmp x11, #0x8\n" - "ble 53f\n" - "52:" // Height 4: Multiply loop: Main loop head - "ld1h { z6.h }, p5/Z, [x15]\n" - "whilelt p0.h, XZR, x11\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "sub x11, x11, #0x8\n" - "ld1rqh { z0.h }, p0/Z, [x10]\n" + "add x23, x23, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "b 47f\n" + "46:" // Height 4: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL 
#1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "47:" // Height 4: input setup done + "cmp x26, #0x8\n" + "ble 49f\n" + "48:" // Height 4: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "sub x26, x26, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" - "ld1rqh { z1.h }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqh { z1.h }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" - "ld1rqh { z2.h }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqh { z2.h }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" - "ld1rqh { z3.h }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqh { z3.h }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x24, x24, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x22, x22, #0x10\n" ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "cmp x11, #0x8\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x26, #0x8\n" ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "prfm pldl1keep, [x24, #0x80]\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "prfm pldl1keep, [x22, #0x80]\n" ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" ".inst 0x646040eb // 
bfdot z11.s, z7.h, z0.h[0]\n" ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" - "addvl x15, x15, #16\n" + "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" ".inst 0x647340f5 // bfdot z21.s, z7.h, 
z3.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" @@ -1042,31 +1012,31 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n" - "bgt 52b\n" - "53:" // Height 4: Multiply loop: Single iteration only - "ld1h { z6.h }, p5/Z, [x15]\n" - "whilelt p0.h, XZR, x11\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x2\n" - "ld1rqh { z0.h }, p0/Z, [x10]\n" + "bgt 48b\n" + "49:" // Height 4: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x10]\n" + 
"whilelt p0.h, XZR, x26\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x2\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" - "ld1rqh { z1.h }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqh { z1.h }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" - "ld1rqh { z2.h }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqh { z2.h }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" - "ld1rqh { z3.h }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqh { z3.h }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" - "add x24, x24, #0x10\n" + "add x22, x22, #0x10\n" ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" @@ -1075,21 +1045,21 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n" - "ble 54f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 50f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x2\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x2\n" ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n" - 
"ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" @@ -1098,21 +1068,21 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n" - "ble 54f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 50f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x2\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x2\n" ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" @@ -1121,20 +1091,20 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" ".inst 
0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n" - "ble 54f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 50f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" @@ -1143,16 +1113,20 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n" - "54:" // Height 4: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "50:" // Height 4: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 49b\n" - "tbz %x[flags], #1, 55f\n" + "cmp x27, x19\n" + "bne 45b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "tbz %x[flags], #1, 51f\n" "add x19, %x[args_ptr], %[offset_min]\n" 
"ld1rw { z1.s }, p5/Z, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -1189,73 +1163,52 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "fmax z21.s, p5/M, z21.s, z1.s\n" "fmax z22.s, p5/M, z22.s, z1.s\n" "fmax z23.s, p5/M, z23.s, z1.s\n" - "55:" // Height 4: No activation - "st1w { z8.s }, p4, [x13]\n" - "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" - "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" - "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "st1w { z12.s }, p4, [x9]\n" - "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" - "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" - "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" - "addvl x9, x9, #4\n" - "st1w { z16.s }, p4, [x27]\n" - "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" - "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" - "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" - "addvl x27, x27, #4\n" - "st1w { z20.s }, p4, [x25]\n" - "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" - "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" - "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" - "addvl x25, x25, #4\n" - "56:" // Height 4: Writeback done - "decw x16, ALL, MUL #4\n" - "cmp x16, XZR\n" - "bgt 45b\n" - "b 86f\n" - "57:" // Height 5 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 58f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "ldr x23, [%x[output_ptr], #0x20]\n" - "add x27, x27, x19, LSL #2\n" - "add x25, x25, x19, LSL #2\n" - "add x23, x23, x19, LSL #2\n" - "b 59f\n" - "58:" // Height 5: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "add x23, x25, x19, LSL #2\n" - "59:" // Height 5: Column loop + "51:" // Height 4: No activation + "st1w { z8.s }, 
p4, [x28]\n" + "st1w { z9.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p4, [x24]\n" + "st1w { z13.s }, p3, [x24, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x24, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x24, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x23]\n" + "st1w { z17.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x23, #3, MUL VL]\n" + "st1w { z20.s }, p4, [x22]\n" + "st1w { z21.s }, p3, [x22, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x22, #3, MUL VL]\n" + "52:" // Height 4: Writeback done + "decw x11, ALL, MUL #4\n" + "cmp x11, XZR\n" + "bgt 41b\n" + "b 80f\n" + "53:" // Height 5 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "54:" // Height 5: Column loop "mov x19, #0x0\n" - "whilelt p4.s, x19, x16\n" + "whilelt p4.s, x19, x11\n" "incw x19\n" - "whilelt p3.s, x19, x16\n" + "whilelt p3.s, x19, x11\n" "incw x19\n" - "whilelt p2.s, x19, x16\n" + "whilelt p2.s, x19, x11\n" "incw x19\n" - "whilelt p1.s, x19, x16\n" - "cbz x14, 60f\n" - "ld1w { z8.s }, p5/Z, [x14]\n" + "whilelt p1.s, x19, x11\n" + "cbz x9, 55f\n" + "ld1w { z8.s }, p5/Z, [x9]\n" "mov z12.d, z8.d\n" - "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n" "mov z16.d, z8.d\n" - "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n" "mov z20.d, z8.d\n" - "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "mov z13.d, z9.d\n" "mov z17.d, z9.d\n" "mov z14.d, z10.d\n" @@ -1269,31 +1222,36 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "mov z25.d, z9.d\n" "mov z26.d, z10.d\n" "mov z27.d, z11.d\n" - "b 62f\n" - "60:" // Height 5: no bias - "tbz %x[flags], #0, 61f\n" - 
"ld1w { z8.s }, p4/Z, [x13]\n" - "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" - "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x9]\n" - "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x27]\n" - "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x25]\n" - "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" - "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z24.s }, p4/Z, [x23]\n" - "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n" - "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n" - "b 62f\n" - "61:" // Height 5: no accumulate + "b 57f\n" + "55:" // Height 5: no bias + "tbz %x[flags], #0, 56f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z8.s }, p4/Z, [x28]\n" + "add x24, x28, x19, LSL #2\n" + "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n" + "add x23, x24, x19, LSL #2\n" + "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z12.s }, p4/Z, [x24]\n" + "add x21, x22, x19, LSL #2\n" + "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x23]\n" + "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x22]\n" + "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x21]\n" + "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x21, 
#3, MUL VL]\n" + "b 57f\n" + "56:" // Height 5: no accumulate "mov z8.b, #0x0\n" "mov z9.b, #0x0\n" "mov z10.b, #0x0\n" @@ -1314,143 +1272,143 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "mov z25.b, #0x0\n" "mov z26.b, #0x0\n" "mov z27.b, #0x0\n" - "62:" // Height 5: setup done - "mov x12, #0x0\n" - "63:" // Height 5: String loop + "57:" // Height 5: setup done + "mov x27, #0x0\n" + "58:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 64f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 59f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "ldr x22, [x20, #0x20]\n" - "cbnz x12, 65f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "cbnz x27, 60f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #1\n" - "add x28, x28, x19, LSL #1\n" - "add x26, x26, x19, LSL #1\n" + "add x25, x25, x19, LSL #1\n" "add x24, x24, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" "add x22, x22, x19, LSL #1\n" - "b 65f\n" - "64:" // Height 5: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #1\n" - "add x26, x28, x19, LSL #1\n" - "add x24, x26, x19, LSL #1\n" - "add x22, x24, x19, LSL #1\n" - "65:" // Height 5: input setup done - "cmp x11, #0x8\n" - "ble 67f\n" - "66:" // Height 5: Multiply loop: Main loop head - "ld1h { z6.h }, p5/Z, [x15]\n" - "whilelt p0.h, XZR, x11\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "sub x11, x11, #0x8\n" - "ld1rqh { z0.h }, p0/Z, [x10]\n" + "add x21, x21, x19, LSL #1\n" + "b 60f\n" + "59:" // Height 5: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, 
x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "add x21, x22, x19, LSL #1\n" + "60:" // Height 5: input setup done + "cmp x26, #0x8\n" + "ble 62f\n" + "61:" // Height 5: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "sub x26, x26, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" - "ld1rqh { z1.h }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqh { z1.h }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" - "ld1rqh { z2.h }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqh { z2.h }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" - "ld1rqh { z3.h }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqh { z3.h }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" - "ld1rqh { z4.h }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" - ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" + "ld1rqh { z4.h }, p0/Z, [x21]\n" "add x22, x22, #0x10\n" + ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x21, x21, #0x10\n" ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "cmp x11, #0x8\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x26, #0x8\n" ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" ".inst 0x646040ca // 
bfdot z10.s, z6.h, z0.h[0]\n" - "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n" ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n" ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n" ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n" ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" - "addvl x15, x15, #16\n" + "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n" ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" ".inst 0x646b40f7 // 
bfdot z23.s, z7.h, z3.h[1]\n" ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n" ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n" ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n" ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n" ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n" ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" ".inst 0x647940ed // bfdot z13.s, z7.h, 
z1.h[3]\n" ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n" ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" @@ -1461,35 +1419,35 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n" ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n" - "bgt 66b\n" - "67:" // Height 5: Multiply loop: Single iteration only - "ld1h { z6.h }, p5/Z, [x15]\n" - "whilelt p0.h, XZR, x11\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x2\n" - "ld1rqh { z0.h }, p0/Z, [x10]\n" + "bgt 61b\n" + "62:" // Height 5: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x2\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" - "ld1rqh { z1.h }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqh { z1.h }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" - "ld1rqh { z2.h }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqh { z2.h }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" - "ld1rqh { z3.h }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqh { z3.h }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" - "ld1rqh { z4.h }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" - ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" + "ld1rqh { z4.h }, p0/Z, [x21]\n" "add x22, x22, #0x10\n" + ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" + "add x21, x21, #0x10\n" ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" ".inst 0x646340d4 // 
bfdot z20.s, z6.h, z3.h[0]\n" ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" @@ -1500,23 +1458,23 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n" ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n" - "ble 68f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 63f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x2\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x2\n" ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n" ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n" ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" @@ -1527,23 +1485,23 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( ".inst 0x646a40f3 // bfdot 
z19.s, z7.h, z2.h[1]\n" ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n" ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n" - "ble 68f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 63f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x2\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x2\n" ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n" ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n" ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" @@ -1554,22 +1512,22 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n" ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n" - "ble 68f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 63f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n" ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" 
".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n" ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" @@ -1580,17 +1538,22 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n" ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n" - "68:" // Height 5: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "63:" // Height 5: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 63b\n" - "tbz %x[flags], #1, 69f\n" + "cmp x27, x19\n" + "bne 58b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "tbz %x[flags], #1, 64f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1rw { z1.s }, p5/Z, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -1635,83 +1598,59 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "fmax z25.s, p5/M, z25.s, z1.s\n" "fmax z26.s, p5/M, z26.s, z1.s\n" "fmax z27.s, p5/M, z27.s, z1.s\n" - "69:" // Height 5: No activation - "st1w { z8.s }, p4, [x13]\n" - "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" - "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" - "st1w { z11.s }, 
p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "st1w { z12.s }, p4, [x9]\n" - "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" - "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" - "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" - "addvl x9, x9, #4\n" - "st1w { z16.s }, p4, [x27]\n" - "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" - "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" - "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" - "addvl x27, x27, #4\n" - "st1w { z20.s }, p4, [x25]\n" - "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" - "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" - "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" - "addvl x25, x25, #4\n" - "st1w { z24.s }, p4, [x23]\n" - "st1w { z25.s }, p3, [x23, #1, MUL VL]\n" - "st1w { z26.s }, p2, [x23, #2, MUL VL]\n" - "st1w { z27.s }, p1, [x23, #3, MUL VL]\n" - "addvl x23, x23, #4\n" - "70:" // Height 5: Writeback done - "decw x16, ALL, MUL #4\n" - "cmp x16, XZR\n" - "bgt 59b\n" - "b 86f\n" - "71:" // Height 6 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 72f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "ldr x23, [%x[output_ptr], #0x20]\n" - "add x27, x27, x19, LSL #2\n" - "ldr x21, [%x[output_ptr], #0x28]\n" - "add %x[output_ptr], %x[output_ptr], #0x30\n" - "add x25, x25, x19, LSL #2\n" - "add x23, x23, x19, LSL #2\n" - "add x21, x21, x19, LSL #2\n" - "b 73f\n" - "72:" // Height 6: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "add x23, x25, x19, LSL #2\n" - "add x21, x23, x19, LSL #2\n" - "add %x[output_ptr], x21, x19, LSL #2\n" - "73:" // Height 6: Column loop + "64:" // Height 5: No activation + "st1w { z8.s }, p4, [x28]\n" + "st1w { z9.s 
}, p3, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p4, [x24]\n" + "st1w { z13.s }, p3, [x24, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x24, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x24, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x23]\n" + "st1w { z17.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x23, #3, MUL VL]\n" + "st1w { z20.s }, p4, [x22]\n" + "st1w { z21.s }, p3, [x22, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x22, #3, MUL VL]\n" + "st1w { z24.s }, p4, [x21]\n" + "st1w { z25.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x21, #3, MUL VL]\n" + "65:" // Height 5: Writeback done + "decw x11, ALL, MUL #4\n" + "cmp x11, XZR\n" + "bgt 54b\n" + "b 80f\n" + "66:" // Height 6 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x19, #0x18\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "67:" // Height 6: Column loop "mov x19, #0x0\n" - "whilelt p4.s, x19, x16\n" + "whilelt p4.s, x19, x11\n" "incw x19\n" - "whilelt p3.s, x19, x16\n" + "whilelt p3.s, x19, x11\n" "incw x19\n" - "whilelt p2.s, x19, x16\n" + "whilelt p2.s, x19, x11\n" "incw x19\n" - "whilelt p1.s, x19, x16\n" - "cbz x14, 74f\n" - "ld1w { z8.s }, p5/Z, [x14]\n" + "whilelt p1.s, x19, x11\n" + "cbz x9, 68f\n" + "ld1w { z8.s }, p5/Z, [x9]\n" "mov z12.d, z8.d\n" - "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n" "mov z16.d, z8.d\n" - "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n" "mov z20.d, z8.d\n" - "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, 
#4\n" "mov z13.d, z9.d\n" "mov z17.d, z9.d\n" "mov z14.d, z10.d\n" @@ -1729,35 +1668,41 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "mov z29.d, z9.d\n" "mov z30.d, z10.d\n" "mov z31.d, z11.d\n" - "b 76f\n" - "74:" // Height 6: no bias - "tbz %x[flags], #0, 75f\n" - "ld1w { z8.s }, p4/Z, [x13]\n" - "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" - "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x9]\n" - "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x27]\n" - "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x25]\n" - "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" - "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z24.s }, p4/Z, [x23]\n" - "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n" - "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n" - "ld1w { z28.s }, p4/Z, [x21]\n" - "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n" - "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n" - "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n" - "b 76f\n" - "75:" // Height 6: no accumulate + "b 70f\n" + "68:" // Height 6: no bias + "tbz %x[flags], #0, 69f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z8.s }, p4/Z, [x28]\n" + "add x24, x28, x19, LSL #2\n" + "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n" + "add x23, x24, x19, LSL #2\n" + "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z12.s }, p4/Z, [x24]\n" + "add x21, x22, x19, LSL #2\n" + "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" + "add x20, x21, x19, LSL #2\n" + "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x23]\n" + 
"ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x22]\n" + "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x21]\n" + "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z28.s }, p4/Z, [x20]\n" + "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n" + "b 70f\n" + "69:" // Height 6: no accumulate "mov z8.b, #0x0\n" "mov z9.b, #0x0\n" "mov z10.b, #0x0\n" @@ -1782,77 +1727,77 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "mov z29.b, #0x0\n" "mov z30.b, #0x0\n" "mov z31.b, #0x0\n" - "76:" // Height 6: setup done - "mov x12, #0x0\n" - "77:" // Height 6: String loop + "70:" // Height 6: setup done + "mov x27, #0x0\n" + "71:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 78f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 72f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "ldr x22, [x20, #0x20]\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" "ldr x20, [x20, #0x28]\n" - "cbnz x12, 79f\n" + "cbnz x27, 73f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #1\n" - "add x28, x28, x19, LSL #1\n" - "add x26, x26, x19, LSL #1\n" + "add x25, x25, x19, LSL #1\n" "add x24, x24, x19, LSL #1\n" + "add x23, x23, x19, LSL 
#1\n" "add x22, x22, x19, LSL #1\n" + "add x21, x21, x19, LSL #1\n" "add x20, x20, x19, LSL #1\n" - "b 79f\n" - "78:" // Height 6: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #1\n" - "add x26, x28, x19, LSL #1\n" - "add x24, x26, x19, LSL #1\n" - "add x22, x24, x19, LSL #1\n" - "add x20, x22, x19, LSL #1\n" - "79:" // Height 6: input setup done - "cmp x11, #0x8\n" - "ble 81f\n" - "80:" // Height 6: Multiply loop: Main loop head - "ld1h { z6.h }, p5/Z, [x15]\n" - "whilelt p0.h, XZR, x11\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "sub x11, x11, #0x8\n" - "ld1rqh { z0.h }, p0/Z, [x10]\n" + "b 73f\n" + "72:" // Height 6: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "add x21, x22, x19, LSL #1\n" + "add x20, x21, x19, LSL #1\n" + "73:" // Height 6: input setup done + "cmp x26, #0x8\n" + "ble 75f\n" + "74:" // Height 6: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "sub x26, x26, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" - "ld1rqh { z1.h }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqh { z1.h }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" - "ld1rqh { z2.h }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqh { z2.h }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" - "ld1rqh { z3.h }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqh { z3.h }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" - "ld1rqh { z4.h }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" + "ld1rqh { z4.h }, p0/Z, [x21]\n" + "add x22, x22, #0x10\n" ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" "ld1rqh { z5.h }, p0/Z, [x20]\n" - "add x22, x22, #0x10\n" + "add x21, x21, #0x10\n" ".inst 
0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" "add x20, x20, #0x10\n" ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "cmp x11, #0x8\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x26, #0x8\n" ".inst 0x646540dc // bfdot z28.s, z6.h, z5.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n" "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n" + "prfm pldl1keep, [x21, #0x80]\n" ".inst 0x646540fd // bfdot z29.s, z7.h, z5.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" "prfm pldl1keep, [x20, #0x80]\n" ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" @@ -1860,85 +1805,85 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n" ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n" ".inst 0x646540de // bfdot z30.s, z6.h, z5.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n" ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n" ".inst 0x646540ff // bfdot z31.s, z7.h, z5.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n" 
".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n" ".inst 0x646d40dc // bfdot z28.s, z6.h, z5.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n" ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n" ".inst 0x646d40fd // bfdot z29.s, z7.h, z5.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" - "addvl x15, x15, #16\n" + "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n" ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n" ".inst 0x646d40de // bfdot z30.s, z6.h, z5.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n" ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n" ".inst 0x646d40ff // bfdot z31.s, z7.h, z5.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n" ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n" ".inst 0x647540dc // bfdot z28.s, z6.h, z5.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" ".inst 
0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n" ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n" ".inst 0x647540fd // bfdot z29.s, z7.h, z5.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n" ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n" ".inst 0x647540de // bfdot z30.s, z6.h, z5.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n" ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n" ".inst 0x647540ff // bfdot z31.s, z7.h, z5.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n" ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n" ".inst 0x647d40dc // bfdot z28.s, z6.h, z5.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n" ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n" ".inst 0x647d40fd // bfdot z29.s, z7.h, z5.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" @@ -1951,39 
+1896,39 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n" ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n" ".inst 0x647d40ff // bfdot z31.s, z7.h, z5.h[3]\n" - "bgt 80b\n" - "81:" // Height 6: Multiply loop: Single iteration only - "ld1h { z6.h }, p5/Z, [x15]\n" - "whilelt p0.h, XZR, x11\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x2\n" - "ld1rqh { z0.h }, p0/Z, [x10]\n" + "bgt 74b\n" + "75:" // Height 6: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x2\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" - "ld1rqh { z1.h }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqh { z1.h }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" - "ld1rqh { z2.h }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqh { z2.h }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" - "ld1rqh { z3.h }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqh { z3.h }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" - "ld1rqh { z4.h }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" + "ld1rqh { z4.h }, p0/Z, [x21]\n" + "add x22, x22, #0x10\n" ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" "ld1rqh { z5.h }, p0/Z, [x20]\n" - "add x22, x22, #0x10\n" + "add x21, x21, #0x10\n" ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" "add x20, x20, #0x10\n" ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n" ".inst 0x646540dc // bfdot z28.s, z6.h, z5.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n" ".inst 0x646540fd // bfdot z29.s, z7.h, z5.h[0]\n" - "ld1h { z7.h }, 
p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" @@ -1996,25 +1941,25 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n" ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n" ".inst 0x646540ff // bfdot z31.s, z7.h, z5.h[0]\n" - "ble 82f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 76f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x2\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x2\n" ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n" ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n" ".inst 0x646d40dc // bfdot z28.s, z6.h, z5.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n" ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n" ".inst 0x646d40fd // bfdot z29.s, z7.h, z5.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" @@ -2027,25 +1972,25 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n" ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n" ".inst 0x646d40ff // bfdot z31.s, z7.h, z5.h[1]\n" - "ble 82f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 76f\n" + 
"ld1h { z6.h }, p5/Z, [x10]\n" ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x2\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x2\n" ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n" ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n" ".inst 0x647540dc // bfdot z28.s, z6.h, z5.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n" ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n" ".inst 0x647540fd // bfdot z29.s, z7.h, z5.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" @@ -2058,24 +2003,24 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n" ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n" ".inst 0x647540ff // bfdot z31.s, z7.h, z5.h[2]\n" - "ble 82f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 76f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n" ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n" ".inst 0x647d40dc // bfdot z28.s, z6.h, z5.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" ".inst 0x647840e9 // bfdot z9.s, z7.h, 
z0.h[3]\n" ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n" ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n" ".inst 0x647d40fd // bfdot z29.s, z7.h, z5.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" @@ -2088,18 +2033,24 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n" ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n" ".inst 0x647d40ff // bfdot z31.s, z7.h, z5.h[3]\n" - "82:" // Height 6: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "76:" // Height 6: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" "prfm pldl1keep, [x20, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 77b\n" - "tbz %x[flags], #1, 83f\n" + "cmp x27, x19\n" + "bne 71b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "tbz %x[flags], #1, 77f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1rw { z1.s }, p5/Z, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -2152,57 +2103,52 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "fmax z29.s, p5/M, z29.s, z1.s\n" "fmax z30.s, p5/M, z30.s, z1.s\n" "fmax z31.s, p5/M, z31.s, z1.s\n" - "83:" // Height 6: No activation - "st1w { z8.s }, p4, [x13]\n" - "st1w { z9.s }, p3, [x13, 
#1, MUL VL]\n" - "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" - "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "st1w { z12.s }, p4, [x9]\n" - "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" - "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" - "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" - "addvl x9, x9, #4\n" - "st1w { z16.s }, p4, [x27]\n" - "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" - "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" - "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" - "addvl x27, x27, #4\n" - "st1w { z20.s }, p4, [x25]\n" - "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" - "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" - "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" - "addvl x25, x25, #4\n" - "st1w { z24.s }, p4, [x23]\n" - "st1w { z25.s }, p3, [x23, #1, MUL VL]\n" - "st1w { z26.s }, p2, [x23, #2, MUL VL]\n" - "st1w { z27.s }, p1, [x23, #3, MUL VL]\n" - "addvl x23, x23, #4\n" - "st1w { z28.s }, p4, [x21]\n" - "st1w { z29.s }, p3, [x21, #1, MUL VL]\n" - "st1w { z30.s }, p2, [x21, #2, MUL VL]\n" - "st1w { z31.s }, p1, [x21, #3, MUL VL]\n" - "addvl x21, x21, #4\n" - "84:" // Height 6: Writeback done - "decw x16, ALL, MUL #4\n" - "cmp x16, XZR\n" - "bgt 73b\n" + "77:" // Height 6: No activation + "st1w { z8.s }, p4, [x28]\n" + "st1w { z9.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p4, [x24]\n" + "st1w { z13.s }, p3, [x24, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x24, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x24, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x23]\n" + "st1w { z17.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x23, #3, MUL VL]\n" + "st1w { z20.s }, p4, [x22]\n" + "st1w { z21.s }, p3, [x22, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x22, #3, MUL VL]\n" + "st1w { z24.s }, p4, [x21]\n" + "st1w { z25.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x21, #2, MUL VL]\n" + 
"st1w { z27.s }, p1, [x21, #3, MUL VL]\n" + "st1w { z28.s }, p4, [x20]\n" + "st1w { z29.s }, p3, [x20, #1, MUL VL]\n" + "st1w { z30.s }, p2, [x20, #2, MUL VL]\n" + "st1w { z31.s }, p1, [x20, #3, MUL VL]\n" + "78:" // Height 6: Writeback done + "decw x11, ALL, MUL #4\n" + "cmp x11, XZR\n" + "bgt 67b\n" "subs %x[M], %x[M], #0x6\n" - "beq 86f\n" + "beq 80f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 85f\n" + "tbz %x[flags], #3, 79f\n" "add x20, x20, #0x6\n" "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "85:" // Update direct input + "79:" // Update direct input "mov x19, #0xc\n" "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" "b 1b\n" - "86:" // Exit + "80:" // Exit : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) - : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", 
"z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp index aa74ce9a73..5c8563952f 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp @@ -36,7 +36,6 @@ namespace arm_gemm { - // Actual kernel implementations void sve_hybrid_fp16_mla_6x4VL( ARGLIST ); @@ -73,7 +72,6 @@ public: // Default to the generic kernel kern_type kernel=sve_hybrid_fp16_mla_6x4VL; - cls_sve_hybrid_fp16_mla_6x4VL(const CPUInfo *) { } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp index bb42dc0e04..7cc03bbfb5 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp @@ -27,6 +27,7 @@ #include "../../utils.hpp" #include +#include namespace arm_gemm { @@ -94,241 +95,235 @@ void sve_hybrid_fp16_mla_6x4VL ( "ptrue p5.b\n" "1:" // Row loop "cmp %x[M], #0x6\n" - "bge 71f\n" + "bge 66f\n" "cmp %x[M], #0x4\n" - "bgt 57f\n" - "beq 43f\n" + "bgt 53f\n" + "beq 40f\n" "cmp %x[M], #0x2\n" - "bgt 29f\n" - "beq 15f\n" - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "mov x14, %x[bias]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 2f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #1\n" - "b 3f\n" - "2:" // Height 1: setup direct output - "mov x13, %x[output_ptr]\n" - "3:" // Height 1: Column loop + "bgt 27f\n" + "beq 14f\n" + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], 
%[offsetof_B_ptr]]\n" + "mov x9, %x[bias]\n" + "mov x28, %x[output_ptr]\n" + "2:" // Height 1: Column loop "mov x19, #0x0\n" - "whilelt p4.h, x19, x16\n" + "whilelt p4.h, x19, x11\n" "inch x19\n" - "whilelt p3.h, x19, x16\n" + "whilelt p3.h, x19, x11\n" "inch x19\n" - "whilelt p2.h, x19, x16\n" + "whilelt p2.h, x19, x11\n" "inch x19\n" - "whilelt p1.h, x19, x16\n" - "cbz x14, 4f\n" - "ld1h { z8.h }, p5/Z, [x14]\n" - "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n" - "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n" - "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" - "b 6f\n" - "4:" // Height 1: no bias - "tbz %x[flags], #0, 5f\n" - "ld1h { z8.h }, p4/Z, [x13]\n" - "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n" - "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n" - "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n" - "b 6f\n" - "5:" // Height 1: no accumulate + "whilelt p1.h, x19, x11\n" + "cbz x9, 3f\n" + "ld1h { z8.h }, p5/Z, [x9]\n" + "ld1h { z9.h }, p5/Z, [x9, #1, MUL VL]\n" + "ld1h { z10.h }, p5/Z, [x9, #2, MUL VL]\n" + "ld1h { z11.h }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "b 5f\n" + "3:" // Height 1: no bias + "tbz %x[flags], #0, 4f\n" + "ld1h { z8.h }, p4/Z, [x28]\n" + "ld1h { z9.h }, p3/Z, [x28, #1, MUL VL]\n" + "ld1h { z10.h }, p2/Z, [x28, #2, MUL VL]\n" + "ld1h { z11.h }, p1/Z, [x28, #3, MUL VL]\n" + "b 5f\n" + "4:" // Height 1: no accumulate "mov z8.b, #0x0\n" "mov z9.b, #0x0\n" "mov z10.b, #0x0\n" "mov z11.b, #0x0\n" - "6:" // Height 1: setup done - "mov x12, #0x0\n" - "7:" // Height 1: String loop + "5:" // Height 1: setup done + "mov x27, #0x0\n" + "6:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 8f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 7f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, 
[x20, #0x0]\n" - "cbnz x12, 9f\n" + "ldr x25, [x20, #0x0]\n" + "cbnz x27, 8f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #1\n" - "b 9f\n" - "8:" // Height 1: setup direct input - "mov x10, %x[input_ptr]\n" - "9:" // Height 1: input setup done - "cmp x11, #0x8\n" - "ble 11f\n" - "10:" // Height 1: Multiply loop: Main loop head - "ld1h { z6.h }, p5/Z, [x15]\n" - "whilelt p0.h, XZR, x11\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "sub x11, x11, #0x8\n" - "ld1rqh { z0.h }, p0/Z, [x10]\n" + "add x25, x25, x19, LSL #1\n" + "b 8f\n" + "7:" // Height 1: setup direct input + "mov x25, %x[input_ptr]\n" + "8:" // Height 1: input setup done + "cmp x26, #0x8\n" + "ble 10f\n" + "9:" // Height 1: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "sub x26, x26, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" "fmla z8.h, z6.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" - "add x10, x10, #0x10\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "add x25, x25, #0x10\n" "fmla z9.h, z7.h, z0.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "cmp x11, #0x8\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "cmp x26, #0x8\n" "fmla z10.h, z6.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" - "prfm pldl1keep, [x10, #0x80]\n" + "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" + "prfm pldl1keep, [x25, #0x80]\n" "fmla z11.h, z7.h, z0.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" "fmla z8.h, z6.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" "fmla z9.h, z7.h, z0.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" - "addvl x15, x15, #16\n" + "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" "fmla z10.h, z6.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" "fmla 
z11.h, z7.h, z0.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" "fmla z8.h, z6.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" "fmla z9.h, z7.h, z0.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" "fmla z10.h, z6.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" "fmla z11.h, z7.h, z0.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" "fmla z8.h, z6.h, z0.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" "fmla z10.h, z6.h, z0.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z11.h, z7.h, z0.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" "fmla z8.h, z6.h, z0.h[4]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[4]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" "fmla z10.h, z6.h, z0.h[4]\n" - "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" "fmla z11.h, z7.h, z0.h[4]\n" - "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" "fmla z8.h, z6.h, z0.h[5]\n" - "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" "fmla z9.h, z7.h, z0.h[5]\n" - "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" - "addvl x15, x15, #16\n" + "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" "fmla z10.h, z6.h, z0.h[5]\n" - "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" "fmla z11.h, z7.h, z0.h[5]\n" - "ld1h { z7.h }, 
p5/Z, [x15, #-7, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" "fmla z8.h, z6.h, z0.h[6]\n" - "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" "fmla z9.h, z7.h, z0.h[6]\n" - "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" "fmla z10.h, z6.h, z0.h[6]\n" - "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" "fmla z11.h, z7.h, z0.h[6]\n" - "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" "fmla z8.h, z6.h, z0.h[7]\n" - "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[7]\n" - "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" "fmla z10.h, z6.h, z0.h[7]\n" "fmla z11.h, z7.h, z0.h[7]\n" - "bgt 10b\n" - "11:" // Height 1: Multiply loop: Single iteration only - "ld1h { z6.h }, p5/Z, [x15]\n" - "whilelt p0.h, XZR, x11\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" - "ld1rqh { z0.h }, p0/Z, [x10]\n" + "bgt 9b\n" + "10:" // Height 1: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" "fmla z8.h, z6.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" - "add x10, x10, #0x10\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "add x25, x25, #0x10\n" "fmla z9.h, z7.h, z0.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[0]\n" "fmla z11.h, z7.h, z0.h[0]\n" - "ble 12f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 11f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, 
#0x1\n" "fmla z9.h, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" "fmla z10.h, z6.h, z0.h[1]\n" - "addvl x15, x15, #4\n" + "addvl x10, x10, #4\n" "fmla z11.h, z7.h, z0.h[1]\n" - "ble 12f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 11f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z9.h, z7.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" "fmla z10.h, z6.h, z0.h[2]\n" - "addvl x15, x15, #4\n" + "addvl x10, x10, #4\n" "fmla z11.h, z7.h, z0.h[2]\n" - "ble 12f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 11f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z9.h, z7.h, z0.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" "fmla z10.h, z6.h, z0.h[3]\n" - "addvl x15, x15, #4\n" + "addvl x10, x10, #4\n" "fmla z11.h, z7.h, z0.h[3]\n" - "ble 12f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 11f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[4]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z9.h, z7.h, z0.h[4]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" "fmla z10.h, z6.h, z0.h[4]\n" 
- "addvl x15, x15, #4\n" + "addvl x10, x10, #4\n" "fmla z11.h, z7.h, z0.h[4]\n" - "ble 12f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 11f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[5]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z9.h, z7.h, z0.h[5]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" "fmla z10.h, z6.h, z0.h[5]\n" - "addvl x15, x15, #4\n" + "addvl x10, x10, #4\n" "fmla z11.h, z7.h, z0.h[5]\n" - "ble 12f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 11f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[6]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z9.h, z7.h, z0.h[6]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" "fmla z10.h, z6.h, z0.h[6]\n" - "addvl x15, x15, #4\n" + "addvl x10, x10, #4\n" "fmla z11.h, z7.h, z0.h[6]\n" - "ble 12f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 11f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[7]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[7]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[7]\n" "fmla z11.h, z7.h, z0.h[7]\n" - "12:" // Height 1: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" + "11:" // Height 1: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" 
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 7b\n" - "tbz %x[flags], #1, 13f\n" + "cmp x27, x19\n" + "bne 6b\n" + "tbz %x[flags], #1, 12f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1rh { z1.h }, p5/Z, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -341,63 +336,56 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmax z9.h, p5/M, z9.h, z1.h\n" "fmax z10.h, p5/M, z10.h, z1.h\n" "fmax z11.h, p5/M, z11.h, z1.h\n" - "13:" // Height 1: No activation - "st1h { z8.h }, p4, [x13]\n" - "st1h { z9.h }, p3, [x13, #1, MUL VL]\n" - "st1h { z10.h }, p2, [x13, #2, MUL VL]\n" - "st1h { z11.h }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "14:" // Height 1: Writeback done - "dech x16, ALL, MUL #4\n" - "cmp x16, XZR\n" - "bgt 3b\n" - "b 86f\n" - "15:" // Height 2 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 16f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #1\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "add x9, x9, x19, LSL #1\n" - "b 17f\n" - "16:" // Height 2: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #1\n" - "17:" // Height 2: Column loop + "12:" // Height 1: No activation + "st1h { z8.h }, p4, [x28]\n" + "st1h { z9.h }, p3, [x28, #1, MUL VL]\n" + "st1h { z10.h }, p2, [x28, #2, MUL VL]\n" + "st1h { z11.h }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "13:" // Height 1: Writeback done + "dech x11, ALL, MUL #4\n" + "cmp x11, XZR\n" + "bgt 2b\n" + "b 80f\n" + "14:" // Height 2 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "15:" // Height 2: Column loop "mov x19, #0x0\n" - "whilelt p4.h, x19, x16\n" + "whilelt p4.h, x19, x11\n" "inch x19\n" - "whilelt p3.h, x19, x16\n" + "whilelt p3.h, x19, x11\n" "inch x19\n" - "whilelt p2.h, x19, 
x16\n" + "whilelt p2.h, x19, x11\n" "inch x19\n" - "whilelt p1.h, x19, x16\n" - "cbz x14, 18f\n" - "ld1h { z8.h }, p5/Z, [x14]\n" + "whilelt p1.h, x19, x11\n" + "cbz x9, 16f\n" + "ld1h { z8.h }, p5/Z, [x9]\n" "mov z12.d, z8.d\n" - "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n" - "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n" + "ld1h { z9.h }, p5/Z, [x9, #1, MUL VL]\n" + "ld1h { z10.h }, p5/Z, [x9, #2, MUL VL]\n" "mov z13.d, z9.d\n" - "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1h { z11.h }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "mov z14.d, z10.d\n" "mov z15.d, z11.d\n" - "b 20f\n" - "18:" // Height 2: no bias - "tbz %x[flags], #0, 19f\n" - "ld1h { z8.h }, p4/Z, [x13]\n" - "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n" - "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n" - "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n" - "ld1h { z12.h }, p4/Z, [x9]\n" - "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n" - "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n" - "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n" - "b 20f\n" - "19:" // Height 2: no accumulate + "b 18f\n" + "16:" // Height 2: no bias + "tbz %x[flags], #0, 17f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1h { z8.h }, p4/Z, [x28]\n" + "add x24, x28, x19, LSL #1\n" + "ld1h { z9.h }, p3/Z, [x28, #1, MUL VL]\n" + "ld1h { z10.h }, p2/Z, [x28, #2, MUL VL]\n" + "ld1h { z11.h }, p1/Z, [x28, #3, MUL VL]\n" + "ld1h { z12.h }, p4/Z, [x24]\n" + "ld1h { z13.h }, p3/Z, [x24, #1, MUL VL]\n" + "ld1h { z14.h }, p2/Z, [x24, #2, MUL VL]\n" + "ld1h { z15.h }, p1/Z, [x24, #3, MUL VL]\n" + "b 18f\n" + "17:" // Height 2: no accumulate "mov z8.b, #0x0\n" "mov z9.b, #0x0\n" "mov z10.b, #0x0\n" @@ -406,269 +394,271 @@ void sve_hybrid_fp16_mla_6x4VL ( "mov z13.b, #0x0\n" "mov z14.b, #0x0\n" "mov z15.b, #0x0\n" - "20:" // Height 2: setup done - "mov x12, #0x0\n" - "21:" // Height 2: String loop + "18:" // Height 2: setup done + "mov x27, #0x0\n" + "19:" // Height 2: String loop "ldr x20, [%x[args_ptr], 
%[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 22f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 20f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "cbnz x12, 23f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x27, 21f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #1\n" - "add x28, x28, x19, LSL #1\n" - "b 23f\n" - "22:" // Height 2: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #1\n" - "23:" // Height 2: input setup done - "cmp x11, #0x8\n" - "ble 25f\n" - "24:" // Height 2: Multiply loop: Main loop head - "ld1h { z6.h }, p5/Z, [x15]\n" - "whilelt p0.h, XZR, x11\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "sub x11, x11, #0x8\n" - "ld1rqh { z0.h }, p0/Z, [x10]\n" + "add x25, x25, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "b 21f\n" + "20:" // Height 2: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "21:" // Height 2: input setup done + "cmp x26, #0x8\n" + "ble 23f\n" + "22:" // Height 2: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "sub x26, x26, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" "fmla z8.h, z6.h, z0.h[0]\n" - "ld1rqh { z1.h }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqh { z1.h }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "fmla z9.h, z7.h, z0.h[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" "fmla z12.h, z6.h, z1.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" - "cmp x11, #0x8\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "cmp x26, #0x8\n" "fmla z13.h, z7.h, z1.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, 
#3, MUL VL]\n" - "prfm pldl1keep, [x28, #0x80]\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "prfm pldl1keep, [x24, #0x80]\n" "fmla z10.h, z6.h, z0.h[0]\n" "fmla z14.h, z6.h, z1.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" "fmla z11.h, z7.h, z0.h[0]\n" "fmla z15.h, z7.h, z1.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" "fmla z8.h, z6.h, z0.h[1]\n" "fmla z12.h, z6.h, z1.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" "fmla z9.h, z7.h, z0.h[1]\n" "fmla z13.h, z7.h, z1.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" - "addvl x15, x15, #16\n" + "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" "fmla z10.h, z6.h, z0.h[1]\n" "fmla z14.h, z6.h, z1.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" "fmla z11.h, z7.h, z0.h[1]\n" "fmla z15.h, z7.h, z1.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" "fmla z8.h, z6.h, z0.h[2]\n" "fmla z12.h, z6.h, z1.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" "fmla z9.h, z7.h, z0.h[2]\n" "fmla z13.h, z7.h, z1.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" "fmla z10.h, z6.h, z0.h[2]\n" "fmla z14.h, z6.h, z1.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" "fmla z11.h, z7.h, z0.h[2]\n" "fmla z15.h, z7.h, z1.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" "fmla z8.h, z6.h, z0.h[3]\n" "fmla z12.h, z6.h, z1.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[3]\n" "fmla z13.h, z7.h, z1.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" "fmla 
z10.h, z6.h, z0.h[3]\n" "fmla z14.h, z6.h, z1.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z11.h, z7.h, z0.h[3]\n" "fmla z15.h, z7.h, z1.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" "fmla z8.h, z6.h, z0.h[4]\n" "fmla z12.h, z6.h, z1.h[4]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[4]\n" "fmla z13.h, z7.h, z1.h[4]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" "fmla z10.h, z6.h, z0.h[4]\n" "fmla z14.h, z6.h, z1.h[4]\n" - "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" "fmla z11.h, z7.h, z0.h[4]\n" "fmla z15.h, z7.h, z1.h[4]\n" - "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" "fmla z8.h, z6.h, z0.h[5]\n" "fmla z12.h, z6.h, z1.h[5]\n" - "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" "fmla z9.h, z7.h, z0.h[5]\n" "fmla z13.h, z7.h, z1.h[5]\n" - "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" - "addvl x15, x15, #16\n" + "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" "fmla z10.h, z6.h, z0.h[5]\n" "fmla z14.h, z6.h, z1.h[5]\n" - "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" "fmla z11.h, z7.h, z0.h[5]\n" "fmla z15.h, z7.h, z1.h[5]\n" - "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" "fmla z8.h, z6.h, z0.h[6]\n" "fmla z12.h, z6.h, z1.h[6]\n" - "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" "fmla z9.h, z7.h, z0.h[6]\n" "fmla z13.h, z7.h, z1.h[6]\n" - "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" "fmla z10.h, z6.h, z0.h[6]\n" "fmla z14.h, z6.h, z1.h[6]\n" - "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" "fmla z11.h, z7.h, 
z0.h[6]\n" "fmla z15.h, z7.h, z1.h[6]\n" - "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" "fmla z8.h, z6.h, z0.h[7]\n" "fmla z12.h, z6.h, z1.h[7]\n" - "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[7]\n" "fmla z13.h, z7.h, z1.h[7]\n" - "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" "fmla z10.h, z6.h, z0.h[7]\n" "fmla z14.h, z6.h, z1.h[7]\n" "fmla z11.h, z7.h, z0.h[7]\n" "fmla z15.h, z7.h, z1.h[7]\n" - "bgt 24b\n" - "25:" // Height 2: Multiply loop: Single iteration only - "ld1h { z6.h }, p5/Z, [x15]\n" - "whilelt p0.h, XZR, x11\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" - "ld1rqh { z0.h }, p0/Z, [x10]\n" + "bgt 22b\n" + "23:" // Height 2: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" "fmla z8.h, z6.h, z0.h[0]\n" - "ld1rqh { z1.h }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqh { z1.h }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "fmla z9.h, z7.h, z0.h[0]\n" - "add x28, x28, #0x10\n" + "add x24, x24, #0x10\n" "fmla z12.h, z6.h, z1.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z13.h, z7.h, z1.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[0]\n" "fmla z14.h, z6.h, z1.h[0]\n" "fmla z11.h, z7.h, z0.h[0]\n" "fmla z15.h, z7.h, z1.h[0]\n" - "ble 26f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 24f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[1]\n" - "ld1h { z6.h }, p5/Z, 
[x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[1]\n" "fmla z13.h, z7.h, z1.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[1]\n" "fmla z14.h, z6.h, z1.h[1]\n" "fmla z11.h, z7.h, z0.h[1]\n" "fmla z15.h, z7.h, z1.h[1]\n" - "ble 26f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 24f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[2]\n" "fmla z13.h, z7.h, z1.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[2]\n" "fmla z14.h, z6.h, z1.h[2]\n" "fmla z11.h, z7.h, z0.h[2]\n" "fmla z15.h, z7.h, z1.h[2]\n" - "ble 26f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 24f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[3]\n" "fmla z13.h, z7.h, z1.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[3]\n" "fmla z14.h, z6.h, z1.h[3]\n" "fmla z11.h, z7.h, z0.h[3]\n" "fmla z15.h, z7.h, z1.h[3]\n" - "ble 26f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 24f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[4]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { 
z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[4]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[4]\n" "fmla z13.h, z7.h, z1.h[4]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[4]\n" "fmla z14.h, z6.h, z1.h[4]\n" "fmla z11.h, z7.h, z0.h[4]\n" "fmla z15.h, z7.h, z1.h[4]\n" - "ble 26f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 24f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[5]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[5]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[5]\n" "fmla z13.h, z7.h, z1.h[5]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[5]\n" "fmla z14.h, z6.h, z1.h[5]\n" "fmla z11.h, z7.h, z0.h[5]\n" "fmla z15.h, z7.h, z1.h[5]\n" - "ble 26f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 24f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[6]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[6]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[6]\n" "fmla z13.h, z7.h, z1.h[6]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[6]\n" "fmla z14.h, z6.h, z1.h[6]\n" "fmla z11.h, z7.h, z0.h[6]\n" "fmla z15.h, z7.h, z1.h[6]\n" - "ble 26f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 24f\n" + "ld1h { z6.h }, p5/Z, 
[x10]\n" "fmla z8.h, z6.h, z0.h[7]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" "fmla z12.h, z6.h, z1.h[7]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[7]\n" "fmla z13.h, z7.h, z1.h[7]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[7]\n" "fmla z14.h, z6.h, z1.h[7]\n" "fmla z11.h, z7.h, z0.h[7]\n" "fmla z15.h, z7.h, z1.h[7]\n" - "26:" // Height 2: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" + "24:" // Height 2: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" + "prfm pldl1keep, [x24, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 21b\n" - "tbz %x[flags], #1, 27f\n" + "cmp x27, x19\n" + "bne 19b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #1\n" + "tbz %x[flags], #1, 25f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1rh { z1.h }, p5/Z, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -689,79 +679,69 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmax z13.h, p5/M, z13.h, z1.h\n" "fmax z14.h, p5/M, z14.h, z1.h\n" "fmax z15.h, p5/M, z15.h, z1.h\n" - "27:" // Height 2: No activation - "st1h { z8.h }, p4, [x13]\n" - "st1h { z9.h }, p3, [x13, #1, MUL VL]\n" - "st1h { z10.h }, p2, [x13, #2, MUL VL]\n" - "st1h { z11.h }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "st1h { z12.h }, p4, [x9]\n" - "st1h { z13.h }, p3, [x9, #1, MUL VL]\n" - "st1h { z14.h }, p2, [x9, #2, MUL VL]\n" - "st1h { z15.h }, p1, [x9, #3, MUL VL]\n" - "addvl x9, x9, #4\n" - "28:" // Height 2: Writeback done - "dech x16, ALL, MUL #4\n" - "cmp x16, XZR\n" - "bgt 17b\n" - "b 86f\n" - "29:" // Height 3 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - 
"ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 30f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #1\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #1\n" - "add x27, x27, x19, LSL #1\n" - "b 31f\n" - "30:" // Height 3: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #1\n" - "add x27, x9, x19, LSL #1\n" - "31:" // Height 3: Column loop + "25:" // Height 2: No activation + "st1h { z8.h }, p4, [x28]\n" + "st1h { z9.h }, p3, [x28, #1, MUL VL]\n" + "st1h { z10.h }, p2, [x28, #2, MUL VL]\n" + "st1h { z11.h }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1h { z12.h }, p4, [x24]\n" + "st1h { z13.h }, p3, [x24, #1, MUL VL]\n" + "st1h { z14.h }, p2, [x24, #2, MUL VL]\n" + "st1h { z15.h }, p1, [x24, #3, MUL VL]\n" + "26:" // Height 2: Writeback done + "dech x11, ALL, MUL #4\n" + "cmp x11, XZR\n" + "bgt 15b\n" + "b 80f\n" + "27:" // Height 3 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "28:" // Height 3: Column loop "mov x19, #0x0\n" - "whilelt p4.h, x19, x16\n" + "whilelt p4.h, x19, x11\n" "inch x19\n" - "whilelt p3.h, x19, x16\n" + "whilelt p3.h, x19, x11\n" "inch x19\n" - "whilelt p2.h, x19, x16\n" + "whilelt p2.h, x19, x11\n" "inch x19\n" - "whilelt p1.h, x19, x16\n" - "cbz x14, 32f\n" - "ld1h { z8.h }, p5/Z, [x14]\n" + "whilelt p1.h, x19, x11\n" + "cbz x9, 29f\n" + "ld1h { z8.h }, p5/Z, [x9]\n" "mov z12.d, z8.d\n" - "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n" + "ld1h { z9.h }, p5/Z, [x9, #1, MUL VL]\n" "mov z16.d, z8.d\n" - "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n" - "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n" + "ld1h { z10.h }, p5/Z, [x9, #2, MUL VL]\n" + "ld1h { z11.h }, p5/Z, [x9, #3, MUL VL]\n" "mov z13.d, z9.d\n" - "addvl x14, x14, #4\n" + "addvl x9, x9, #4\n" "mov z17.d, 
z9.d\n" "mov z14.d, z10.d\n" "mov z15.d, z11.d\n" "mov z18.d, z10.d\n" "mov z19.d, z11.d\n" - "b 34f\n" - "32:" // Height 3: no bias - "tbz %x[flags], #0, 33f\n" - "ld1h { z8.h }, p4/Z, [x13]\n" - "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n" - "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n" - "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n" - "ld1h { z12.h }, p4/Z, [x9]\n" - "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n" - "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n" - "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n" - "ld1h { z16.h }, p4/Z, [x27]\n" - "ld1h { z17.h }, p3/Z, [x27, #1, MUL VL]\n" - "ld1h { z18.h }, p2/Z, [x27, #2, MUL VL]\n" - "ld1h { z19.h }, p1/Z, [x27, #3, MUL VL]\n" - "b 34f\n" - "33:" // Height 3: no accumulate + "b 31f\n" + "29:" // Height 3: no bias + "tbz %x[flags], #0, 30f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1h { z8.h }, p4/Z, [x28]\n" + "add x24, x28, x19, LSL #1\n" + "ld1h { z9.h }, p3/Z, [x28, #1, MUL VL]\n" + "ld1h { z10.h }, p2/Z, [x28, #2, MUL VL]\n" + "add x23, x24, x19, LSL #1\n" + "ld1h { z11.h }, p1/Z, [x28, #3, MUL VL]\n" + "ld1h { z12.h }, p4/Z, [x24]\n" + "ld1h { z13.h }, p3/Z, [x24, #1, MUL VL]\n" + "ld1h { z14.h }, p2/Z, [x24, #2, MUL VL]\n" + "ld1h { z15.h }, p1/Z, [x24, #3, MUL VL]\n" + "ld1h { z16.h }, p4/Z, [x23]\n" + "ld1h { z17.h }, p3/Z, [x23, #1, MUL VL]\n" + "ld1h { z18.h }, p2/Z, [x23, #2, MUL VL]\n" + "ld1h { z19.h }, p1/Z, [x23, #3, MUL VL]\n" + "b 31f\n" + "30:" // Height 3: no accumulate "mov z8.b, #0x0\n" "mov z9.b, #0x0\n" "mov z10.b, #0x0\n" @@ -774,342 +754,345 @@ void sve_hybrid_fp16_mla_6x4VL ( "mov z17.b, #0x0\n" "mov z18.b, #0x0\n" "mov z19.b, #0x0\n" - "34:" // Height 3: setup done - "mov x12, #0x0\n" - "35:" // Height 3: String loop + "31:" // Height 3: setup done + "mov x27, #0x0\n" + "32:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 36f\n" - "ldr 
x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 33f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "cbnz x12, 37f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "cbnz x27, 34f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #1\n" - "add x28, x28, x19, LSL #1\n" - "add x26, x26, x19, LSL #1\n" - "b 37f\n" - "36:" // Height 3: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #1\n" - "add x26, x28, x19, LSL #1\n" - "37:" // Height 3: input setup done - "cmp x11, #0x8\n" - "ble 39f\n" - "38:" // Height 3: Multiply loop: Main loop head - "ld1h { z6.h }, p5/Z, [x15]\n" - "whilelt p0.h, XZR, x11\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "sub x11, x11, #0x8\n" - "ld1rqh { z0.h }, p0/Z, [x10]\n" + "add x25, x25, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" + "b 34f\n" + "33:" // Height 3: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "34:" // Height 3: input setup done + "cmp x26, #0x8\n" + "ble 36f\n" + "35:" // Height 3: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "sub x26, x26, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" "fmla z8.h, z6.h, z0.h[0]\n" - "ld1rqh { z1.h }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqh { z1.h }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "fmla z9.h, z7.h, z0.h[0]\n" - "ld1rqh { z2.h }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqh { z2.h }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "fmla z12.h, z6.h, z1.h[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x23, x23, #0x10\n" "fmla z16.h, z6.h, z2.h[0]\n" - "ld1h { 
z6.h }, p5/Z, [x15, #2, MUL VL]\n" - "cmp x11, #0x8\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "cmp x26, #0x8\n" "fmla z13.h, z7.h, z1.h[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" "fmla z17.h, z7.h, z2.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "prfm pldl1keep, [x23, #0x80]\n" "fmla z10.h, z6.h, z0.h[0]\n" "fmla z14.h, z6.h, z1.h[0]\n" "fmla z18.h, z6.h, z2.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" "fmla z11.h, z7.h, z0.h[0]\n" "fmla z15.h, z7.h, z1.h[0]\n" "fmla z19.h, z7.h, z2.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" "fmla z8.h, z6.h, z0.h[1]\n" "fmla z12.h, z6.h, z1.h[1]\n" "fmla z16.h, z6.h, z2.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" "fmla z9.h, z7.h, z0.h[1]\n" "fmla z13.h, z7.h, z1.h[1]\n" "fmla z17.h, z7.h, z2.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" - "addvl x15, x15, #16\n" + "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" "fmla z10.h, z6.h, z0.h[1]\n" "fmla z14.h, z6.h, z1.h[1]\n" "fmla z18.h, z6.h, z2.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" "fmla z11.h, z7.h, z0.h[1]\n" "fmla z15.h, z7.h, z1.h[1]\n" "fmla z19.h, z7.h, z2.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" "fmla z8.h, z6.h, z0.h[2]\n" "fmla z12.h, z6.h, z1.h[2]\n" "fmla z16.h, z6.h, z2.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" "fmla z9.h, z7.h, z0.h[2]\n" "fmla z13.h, z7.h, z1.h[2]\n" "fmla z17.h, z7.h, z2.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" "fmla z10.h, z6.h, z0.h[2]\n" "fmla z14.h, z6.h, z1.h[2]\n" "fmla z18.h, z6.h, z2.h[2]\n" - 
"ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" "fmla z11.h, z7.h, z0.h[2]\n" "fmla z15.h, z7.h, z1.h[2]\n" "fmla z19.h, z7.h, z2.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" "fmla z8.h, z6.h, z0.h[3]\n" "fmla z12.h, z6.h, z1.h[3]\n" "fmla z16.h, z6.h, z2.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[3]\n" "fmla z13.h, z7.h, z1.h[3]\n" "fmla z17.h, z7.h, z2.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" "fmla z10.h, z6.h, z0.h[3]\n" "fmla z14.h, z6.h, z1.h[3]\n" "fmla z18.h, z6.h, z2.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z11.h, z7.h, z0.h[3]\n" "fmla z15.h, z7.h, z1.h[3]\n" "fmla z19.h, z7.h, z2.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" "fmla z8.h, z6.h, z0.h[4]\n" "fmla z12.h, z6.h, z1.h[4]\n" "fmla z16.h, z6.h, z2.h[4]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[4]\n" "fmla z13.h, z7.h, z1.h[4]\n" "fmla z17.h, z7.h, z2.h[4]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" "fmla z10.h, z6.h, z0.h[4]\n" "fmla z14.h, z6.h, z1.h[4]\n" "fmla z18.h, z6.h, z2.h[4]\n" - "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" "fmla z11.h, z7.h, z0.h[4]\n" "fmla z15.h, z7.h, z1.h[4]\n" "fmla z19.h, z7.h, z2.h[4]\n" - "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" "fmla z8.h, z6.h, z0.h[5]\n" "fmla z12.h, z6.h, z1.h[5]\n" "fmla z16.h, z6.h, z2.h[5]\n" - "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" "fmla z9.h, z7.h, z0.h[5]\n" "fmla z13.h, z7.h, z1.h[5]\n" "fmla z17.h, z7.h, z2.h[5]\n" - "ld1h { z7.h }, p5/Z, [x15, #7, MUL 
VL]\n" - "addvl x15, x15, #16\n" + "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" "fmla z10.h, z6.h, z0.h[5]\n" "fmla z14.h, z6.h, z1.h[5]\n" "fmla z18.h, z6.h, z2.h[5]\n" - "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" "fmla z11.h, z7.h, z0.h[5]\n" "fmla z15.h, z7.h, z1.h[5]\n" "fmla z19.h, z7.h, z2.h[5]\n" - "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" "fmla z8.h, z6.h, z0.h[6]\n" "fmla z12.h, z6.h, z1.h[6]\n" "fmla z16.h, z6.h, z2.h[6]\n" - "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" "fmla z9.h, z7.h, z0.h[6]\n" "fmla z13.h, z7.h, z1.h[6]\n" "fmla z17.h, z7.h, z2.h[6]\n" - "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" "fmla z10.h, z6.h, z0.h[6]\n" "fmla z14.h, z6.h, z1.h[6]\n" "fmla z18.h, z6.h, z2.h[6]\n" - "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" "fmla z11.h, z7.h, z0.h[6]\n" "fmla z15.h, z7.h, z1.h[6]\n" "fmla z19.h, z7.h, z2.h[6]\n" - "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" "fmla z8.h, z6.h, z0.h[7]\n" "fmla z12.h, z6.h, z1.h[7]\n" "fmla z16.h, z6.h, z2.h[7]\n" - "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[7]\n" "fmla z13.h, z7.h, z1.h[7]\n" "fmla z17.h, z7.h, z2.h[7]\n" - "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" "fmla z10.h, z6.h, z0.h[7]\n" "fmla z14.h, z6.h, z1.h[7]\n" "fmla z18.h, z6.h, z2.h[7]\n" "fmla z11.h, z7.h, z0.h[7]\n" "fmla z15.h, z7.h, z1.h[7]\n" "fmla z19.h, z7.h, z2.h[7]\n" - "bgt 38b\n" - "39:" // Height 3: Multiply loop: Single iteration only - "ld1h { z6.h }, p5/Z, [x15]\n" - "whilelt p0.h, XZR, x11\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" - "ld1rqh { z0.h }, p0/Z, [x10]\n" + "bgt 35b\n" + "36:" 
// Height 3: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" "fmla z8.h, z6.h, z0.h[0]\n" - "ld1rqh { z1.h }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqh { z1.h }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "fmla z9.h, z7.h, z0.h[0]\n" - "ld1rqh { z2.h }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqh { z2.h }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "fmla z12.h, z6.h, z1.h[0]\n" - "add x26, x26, #0x10\n" + "add x23, x23, #0x10\n" "fmla z13.h, z7.h, z1.h[0]\n" "fmla z16.h, z6.h, z2.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z17.h, z7.h, z2.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[0]\n" "fmla z14.h, z6.h, z1.h[0]\n" "fmla z18.h, z6.h, z2.h[0]\n" "fmla z11.h, z7.h, z0.h[0]\n" "fmla z15.h, z7.h, z1.h[0]\n" "fmla z19.h, z7.h, z2.h[0]\n" - "ble 40f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 37f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[1]\n" "fmla z16.h, z6.h, z2.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[1]\n" "fmla z13.h, z7.h, z1.h[1]\n" "fmla z17.h, z7.h, z2.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[1]\n" "fmla z14.h, z6.h, z1.h[1]\n" "fmla z18.h, z6.h, z2.h[1]\n" "fmla z11.h, z7.h, z0.h[1]\n" "fmla z15.h, z7.h, z1.h[1]\n" "fmla z19.h, z7.h, z2.h[1]\n" - "ble 40f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 37f\n" + "ld1h { 
z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[2]\n" "fmla z16.h, z6.h, z2.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[2]\n" "fmla z13.h, z7.h, z1.h[2]\n" "fmla z17.h, z7.h, z2.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[2]\n" "fmla z14.h, z6.h, z1.h[2]\n" "fmla z18.h, z6.h, z2.h[2]\n" "fmla z11.h, z7.h, z0.h[2]\n" "fmla z15.h, z7.h, z1.h[2]\n" "fmla z19.h, z7.h, z2.h[2]\n" - "ble 40f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 37f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[3]\n" "fmla z16.h, z6.h, z2.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[3]\n" "fmla z13.h, z7.h, z1.h[3]\n" "fmla z17.h, z7.h, z2.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[3]\n" "fmla z14.h, z6.h, z1.h[3]\n" "fmla z18.h, z6.h, z2.h[3]\n" "fmla z11.h, z7.h, z0.h[3]\n" "fmla z15.h, z7.h, z1.h[3]\n" "fmla z19.h, z7.h, z2.h[3]\n" - "ble 40f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 37f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[4]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[4]\n" "fmla z16.h, z6.h, z2.h[4]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla 
z9.h, z7.h, z0.h[4]\n" "fmla z13.h, z7.h, z1.h[4]\n" "fmla z17.h, z7.h, z2.h[4]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[4]\n" "fmla z14.h, z6.h, z1.h[4]\n" "fmla z18.h, z6.h, z2.h[4]\n" "fmla z11.h, z7.h, z0.h[4]\n" "fmla z15.h, z7.h, z1.h[4]\n" "fmla z19.h, z7.h, z2.h[4]\n" - "ble 40f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 37f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[5]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[5]\n" "fmla z16.h, z6.h, z2.h[5]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[5]\n" "fmla z13.h, z7.h, z1.h[5]\n" "fmla z17.h, z7.h, z2.h[5]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[5]\n" "fmla z14.h, z6.h, z1.h[5]\n" "fmla z18.h, z6.h, z2.h[5]\n" "fmla z11.h, z7.h, z0.h[5]\n" "fmla z15.h, z7.h, z1.h[5]\n" "fmla z19.h, z7.h, z2.h[5]\n" - "ble 40f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 37f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[6]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[6]\n" "fmla z16.h, z6.h, z2.h[6]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[6]\n" "fmla z13.h, z7.h, z1.h[6]\n" "fmla z17.h, z7.h, z2.h[6]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[6]\n" "fmla z14.h, z6.h, z1.h[6]\n" "fmla z18.h, z6.h, z2.h[6]\n" "fmla z11.h, z7.h, z0.h[6]\n" "fmla 
z15.h, z7.h, z1.h[6]\n" "fmla z19.h, z7.h, z2.h[6]\n" - "ble 40f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 37f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[7]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" "fmla z12.h, z6.h, z1.h[7]\n" "fmla z16.h, z6.h, z2.h[7]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[7]\n" "fmla z13.h, z7.h, z1.h[7]\n" "fmla z17.h, z7.h, z2.h[7]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[7]\n" "fmla z14.h, z6.h, z1.h[7]\n" "fmla z18.h, z6.h, z2.h[7]\n" "fmla z11.h, z7.h, z0.h[7]\n" "fmla z15.h, z7.h, z1.h[7]\n" "fmla z19.h, z7.h, z2.h[7]\n" - "40:" // Height 3: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "37:" // Height 3: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 35b\n" - "tbz %x[flags], #1, 41f\n" + "cmp x27, x19\n" + "bne 32b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "tbz %x[flags], #1, 38f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1rh { z1.h }, p5/Z, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -1138,65 +1121,48 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmax z17.h, p5/M, z17.h, z1.h\n" "fmax z18.h, p5/M, z18.h, z1.h\n" "fmax z19.h, p5/M, z19.h, z1.h\n" - "41:" // Height 3: No activation - "st1h { z8.h }, p4, [x13]\n" - "st1h { z9.h }, p3, [x13, #1, MUL VL]\n" - "st1h { z10.h }, p2, [x13, #2, MUL VL]\n" - "st1h { z11.h }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "st1h { z12.h }, 
p4, [x9]\n" - "st1h { z13.h }, p3, [x9, #1, MUL VL]\n" - "st1h { z14.h }, p2, [x9, #2, MUL VL]\n" - "st1h { z15.h }, p1, [x9, #3, MUL VL]\n" - "addvl x9, x9, #4\n" - "st1h { z16.h }, p4, [x27]\n" - "st1h { z17.h }, p3, [x27, #1, MUL VL]\n" - "st1h { z18.h }, p2, [x27, #2, MUL VL]\n" - "st1h { z19.h }, p1, [x27, #3, MUL VL]\n" - "addvl x27, x27, #4\n" - "42:" // Height 3: Writeback done - "dech x16, ALL, MUL #4\n" - "cmp x16, XZR\n" - "bgt 31b\n" - "b 86f\n" - "43:" // Height 4 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 44f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #1\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #1\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "add x27, x27, x19, LSL #1\n" - "add x25, x25, x19, LSL #1\n" - "b 45f\n" - "44:" // Height 4: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #1\n" - "add x27, x9, x19, LSL #1\n" - "add x25, x27, x19, LSL #1\n" - "45:" // Height 4: Column loop + "38:" // Height 3: No activation + "st1h { z8.h }, p4, [x28]\n" + "st1h { z9.h }, p3, [x28, #1, MUL VL]\n" + "st1h { z10.h }, p2, [x28, #2, MUL VL]\n" + "st1h { z11.h }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1h { z12.h }, p4, [x24]\n" + "st1h { z13.h }, p3, [x24, #1, MUL VL]\n" + "st1h { z14.h }, p2, [x24, #2, MUL VL]\n" + "st1h { z15.h }, p1, [x24, #3, MUL VL]\n" + "st1h { z16.h }, p4, [x23]\n" + "st1h { z17.h }, p3, [x23, #1, MUL VL]\n" + "st1h { z18.h }, p2, [x23, #2, MUL VL]\n" + "st1h { z19.h }, p1, [x23, #3, MUL VL]\n" + "39:" // Height 3: Writeback done + "dech x11, ALL, MUL #4\n" + "cmp x11, XZR\n" + "bgt 28b\n" + "b 80f\n" + "40:" // Height 4 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, 
%x[output_ptr]\n" + "41:" // Height 4: Column loop "mov x19, #0x0\n" - "whilelt p4.h, x19, x16\n" + "whilelt p4.h, x19, x11\n" "inch x19\n" - "whilelt p3.h, x19, x16\n" + "whilelt p3.h, x19, x11\n" "inch x19\n" - "whilelt p2.h, x19, x16\n" + "whilelt p2.h, x19, x11\n" "inch x19\n" - "whilelt p1.h, x19, x16\n" - "cbz x14, 46f\n" - "ld1h { z8.h }, p5/Z, [x14]\n" + "whilelt p1.h, x19, x11\n" + "cbz x9, 42f\n" + "ld1h { z8.h }, p5/Z, [x9]\n" "mov z12.d, z8.d\n" - "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n" + "ld1h { z9.h }, p5/Z, [x9, #1, MUL VL]\n" "mov z16.d, z8.d\n" - "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n" + "ld1h { z10.h }, p5/Z, [x9, #2, MUL VL]\n" "mov z20.d, z8.d\n" - "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1h { z11.h }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "mov z13.d, z9.d\n" "mov z17.d, z9.d\n" "mov z14.d, z10.d\n" @@ -1206,27 +1172,31 @@ void sve_hybrid_fp16_mla_6x4VL ( "mov z21.d, z9.d\n" "mov z22.d, z10.d\n" "mov z23.d, z11.d\n" - "b 48f\n" - "46:" // Height 4: no bias - "tbz %x[flags], #0, 47f\n" - "ld1h { z8.h }, p4/Z, [x13]\n" - "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n" - "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n" - "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n" - "ld1h { z12.h }, p4/Z, [x9]\n" - "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n" - "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n" - "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n" - "ld1h { z16.h }, p4/Z, [x27]\n" - "ld1h { z17.h }, p3/Z, [x27, #1, MUL VL]\n" - "ld1h { z18.h }, p2/Z, [x27, #2, MUL VL]\n" - "ld1h { z19.h }, p1/Z, [x27, #3, MUL VL]\n" - "ld1h { z20.h }, p4/Z, [x25]\n" - "ld1h { z21.h }, p3/Z, [x25, #1, MUL VL]\n" - "ld1h { z22.h }, p2/Z, [x25, #2, MUL VL]\n" - "ld1h { z23.h }, p1/Z, [x25, #3, MUL VL]\n" - "b 48f\n" - "47:" // Height 4: no accumulate + "b 44f\n" + "42:" // Height 4: no bias + "tbz %x[flags], #0, 43f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1h { z8.h }, p4/Z, [x28]\n" + "add x24, x28, x19, LSL #1\n" + "ld1h 
{ z9.h }, p3/Z, [x28, #1, MUL VL]\n" + "ld1h { z10.h }, p2/Z, [x28, #2, MUL VL]\n" + "add x23, x24, x19, LSL #1\n" + "ld1h { z11.h }, p1/Z, [x28, #3, MUL VL]\n" + "add x22, x23, x19, LSL #1\n" + "ld1h { z12.h }, p4/Z, [x24]\n" + "ld1h { z13.h }, p3/Z, [x24, #1, MUL VL]\n" + "ld1h { z14.h }, p2/Z, [x24, #2, MUL VL]\n" + "ld1h { z15.h }, p1/Z, [x24, #3, MUL VL]\n" + "ld1h { z16.h }, p4/Z, [x23]\n" + "ld1h { z17.h }, p3/Z, [x23, #1, MUL VL]\n" + "ld1h { z18.h }, p2/Z, [x23, #2, MUL VL]\n" + "ld1h { z19.h }, p1/Z, [x23, #3, MUL VL]\n" + "ld1h { z20.h }, p4/Z, [x22]\n" + "ld1h { z21.h }, p3/Z, [x22, #1, MUL VL]\n" + "ld1h { z22.h }, p2/Z, [x22, #2, MUL VL]\n" + "ld1h { z23.h }, p1/Z, [x22, #3, MUL VL]\n" + "b 44f\n" + "43:" // Height 4: no accumulate "mov z8.b, #0x0\n" "mov z9.b, #0x0\n" "mov z10.b, #0x0\n" @@ -1243,204 +1213,204 @@ void sve_hybrid_fp16_mla_6x4VL ( "mov z21.b, #0x0\n" "mov z22.b, #0x0\n" "mov z23.b, #0x0\n" - "48:" // Height 4: setup done - "mov x12, #0x0\n" - "49:" // Height 4: String loop + "44:" // Height 4: setup done + "mov x27, #0x0\n" + "45:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 50f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 46f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "cbnz x12, 51f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "cbnz x27, 47f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #1\n" - "add x28, x28, x19, LSL #1\n" - "add x26, x26, x19, LSL #1\n" + "add x25, x25, x19, LSL #1\n" "add x24, x24, x19, LSL #1\n" - "b 51f\n" - "50:" // Height 4: setup direct input - 
"mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #1\n" - "add x26, x28, x19, LSL #1\n" - "add x24, x26, x19, LSL #1\n" - "51:" // Height 4: input setup done - "cmp x11, #0x8\n" - "ble 53f\n" - "52:" // Height 4: Multiply loop: Main loop head - "ld1h { z6.h }, p5/Z, [x15]\n" - "whilelt p0.h, XZR, x11\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "sub x11, x11, #0x8\n" - "ld1rqh { z0.h }, p0/Z, [x10]\n" + "add x23, x23, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "b 47f\n" + "46:" // Height 4: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "47:" // Height 4: input setup done + "cmp x26, #0x8\n" + "ble 49f\n" + "48:" // Height 4: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "sub x26, x26, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" "fmla z8.h, z6.h, z0.h[0]\n" - "ld1rqh { z1.h }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqh { z1.h }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "fmla z9.h, z7.h, z0.h[0]\n" - "ld1rqh { z2.h }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqh { z2.h }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "fmla z12.h, z6.h, z1.h[0]\n" - "ld1rqh { z3.h }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqh { z3.h }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "fmla z16.h, z6.h, z2.h[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x24, x24, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x22, x22, #0x10\n" "fmla z13.h, z7.h, z1.h[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "cmp x11, #0x8\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x26, #0x8\n" "fmla z20.h, z6.h, z3.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z17.h, z7.h, z2.h[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "fmla z21.h, z7.h, z3.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "prfm 
pldl1keep, [x24, #0x80]\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "prfm pldl1keep, [x22, #0x80]\n" "fmla z10.h, z6.h, z0.h[0]\n" "fmla z14.h, z6.h, z1.h[0]\n" "fmla z18.h, z6.h, z2.h[0]\n" "fmla z22.h, z6.h, z3.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" "fmla z11.h, z7.h, z0.h[0]\n" "fmla z15.h, z7.h, z1.h[0]\n" "fmla z19.h, z7.h, z2.h[0]\n" "fmla z23.h, z7.h, z3.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" "fmla z8.h, z6.h, z0.h[1]\n" "fmla z12.h, z6.h, z1.h[1]\n" "fmla z16.h, z6.h, z2.h[1]\n" "fmla z20.h, z6.h, z3.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" "fmla z9.h, z7.h, z0.h[1]\n" "fmla z13.h, z7.h, z1.h[1]\n" "fmla z17.h, z7.h, z2.h[1]\n" "fmla z21.h, z7.h, z3.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" - "addvl x15, x15, #16\n" + "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" "fmla z10.h, z6.h, z0.h[1]\n" "fmla z14.h, z6.h, z1.h[1]\n" "fmla z18.h, z6.h, z2.h[1]\n" "fmla z22.h, z6.h, z3.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" "fmla z11.h, z7.h, z0.h[1]\n" "fmla z15.h, z7.h, z1.h[1]\n" "fmla z19.h, z7.h, z2.h[1]\n" "fmla z23.h, z7.h, z3.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" "fmla z8.h, z6.h, z0.h[2]\n" "fmla z12.h, z6.h, z1.h[2]\n" "fmla z16.h, z6.h, z2.h[2]\n" "fmla z20.h, z6.h, z3.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" "fmla z9.h, z7.h, z0.h[2]\n" "fmla z13.h, z7.h, z1.h[2]\n" "fmla z17.h, z7.h, z2.h[2]\n" "fmla z21.h, z7.h, z3.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" "fmla z10.h, z6.h, z0.h[2]\n" "fmla z14.h, z6.h, z1.h[2]\n" "fmla z18.h, z6.h, z2.h[2]\n" "fmla z22.h, z6.h, z3.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #-4, 
MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" "fmla z11.h, z7.h, z0.h[2]\n" "fmla z15.h, z7.h, z1.h[2]\n" "fmla z19.h, z7.h, z2.h[2]\n" "fmla z23.h, z7.h, z3.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" "fmla z8.h, z6.h, z0.h[3]\n" "fmla z12.h, z6.h, z1.h[3]\n" "fmla z16.h, z6.h, z2.h[3]\n" "fmla z20.h, z6.h, z3.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[3]\n" "fmla z13.h, z7.h, z1.h[3]\n" "fmla z17.h, z7.h, z2.h[3]\n" "fmla z21.h, z7.h, z3.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" "fmla z10.h, z6.h, z0.h[3]\n" "fmla z14.h, z6.h, z1.h[3]\n" "fmla z18.h, z6.h, z2.h[3]\n" "fmla z22.h, z6.h, z3.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z11.h, z7.h, z0.h[3]\n" "fmla z15.h, z7.h, z1.h[3]\n" "fmla z19.h, z7.h, z2.h[3]\n" "fmla z23.h, z7.h, z3.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" "fmla z8.h, z6.h, z0.h[4]\n" "fmla z12.h, z6.h, z1.h[4]\n" "fmla z16.h, z6.h, z2.h[4]\n" "fmla z20.h, z6.h, z3.h[4]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[4]\n" "fmla z13.h, z7.h, z1.h[4]\n" "fmla z17.h, z7.h, z2.h[4]\n" "fmla z21.h, z7.h, z3.h[4]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" "fmla z10.h, z6.h, z0.h[4]\n" "fmla z14.h, z6.h, z1.h[4]\n" "fmla z18.h, z6.h, z2.h[4]\n" "fmla z22.h, z6.h, z3.h[4]\n" - "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" "fmla z11.h, z7.h, z0.h[4]\n" "fmla z15.h, z7.h, z1.h[4]\n" "fmla z19.h, z7.h, z2.h[4]\n" "fmla z23.h, z7.h, z3.h[4]\n" - "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" "fmla z8.h, z6.h, z0.h[5]\n" "fmla z12.h, z6.h, z1.h[5]\n" "fmla z16.h, 
z6.h, z2.h[5]\n" "fmla z20.h, z6.h, z3.h[5]\n" - "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" "fmla z9.h, z7.h, z0.h[5]\n" "fmla z13.h, z7.h, z1.h[5]\n" "fmla z17.h, z7.h, z2.h[5]\n" "fmla z21.h, z7.h, z3.h[5]\n" - "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" - "addvl x15, x15, #16\n" + "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" "fmla z10.h, z6.h, z0.h[5]\n" "fmla z14.h, z6.h, z1.h[5]\n" "fmla z18.h, z6.h, z2.h[5]\n" "fmla z22.h, z6.h, z3.h[5]\n" - "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" "fmla z11.h, z7.h, z0.h[5]\n" "fmla z15.h, z7.h, z1.h[5]\n" "fmla z19.h, z7.h, z2.h[5]\n" "fmla z23.h, z7.h, z3.h[5]\n" - "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" "fmla z8.h, z6.h, z0.h[6]\n" "fmla z12.h, z6.h, z1.h[6]\n" "fmla z16.h, z6.h, z2.h[6]\n" "fmla z20.h, z6.h, z3.h[6]\n" - "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" "fmla z9.h, z7.h, z0.h[6]\n" "fmla z13.h, z7.h, z1.h[6]\n" "fmla z17.h, z7.h, z2.h[6]\n" "fmla z21.h, z7.h, z3.h[6]\n" - "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" "fmla z10.h, z6.h, z0.h[6]\n" "fmla z14.h, z6.h, z1.h[6]\n" "fmla z18.h, z6.h, z2.h[6]\n" "fmla z22.h, z6.h, z3.h[6]\n" - "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" "fmla z11.h, z7.h, z0.h[6]\n" "fmla z15.h, z7.h, z1.h[6]\n" "fmla z19.h, z7.h, z2.h[6]\n" "fmla z23.h, z7.h, z3.h[6]\n" - "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" "fmla z8.h, z6.h, z0.h[7]\n" "fmla z12.h, z6.h, z1.h[7]\n" "fmla z16.h, z6.h, z2.h[7]\n" "fmla z20.h, z6.h, z3.h[7]\n" - "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[7]\n" "fmla z13.h, z7.h, z1.h[7]\n" "fmla z17.h, z7.h, z2.h[7]\n" "fmla z21.h, z7.h, z3.h[7]\n" - 
"ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" "fmla z10.h, z6.h, z0.h[7]\n" "fmla z14.h, z6.h, z1.h[7]\n" "fmla z18.h, z6.h, z2.h[7]\n" @@ -1449,31 +1419,31 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z15.h, z7.h, z1.h[7]\n" "fmla z19.h, z7.h, z2.h[7]\n" "fmla z23.h, z7.h, z3.h[7]\n" - "bgt 52b\n" - "53:" // Height 4: Multiply loop: Single iteration only - "ld1h { z6.h }, p5/Z, [x15]\n" - "whilelt p0.h, XZR, x11\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" - "ld1rqh { z0.h }, p0/Z, [x10]\n" + "bgt 48b\n" + "49:" // Height 4: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" "fmla z8.h, z6.h, z0.h[0]\n" - "ld1rqh { z1.h }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqh { z1.h }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "fmla z9.h, z7.h, z0.h[0]\n" - "ld1rqh { z2.h }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqh { z2.h }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "fmla z12.h, z6.h, z1.h[0]\n" - "ld1rqh { z3.h }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqh { z3.h }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "fmla z16.h, z6.h, z2.h[0]\n" - "add x24, x24, #0x10\n" + "add x22, x22, #0x10\n" "fmla z13.h, z7.h, z1.h[0]\n" "fmla z17.h, z7.h, z2.h[0]\n" "fmla z20.h, z6.h, z3.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z21.h, z7.h, z3.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[0]\n" "fmla z14.h, z6.h, z1.h[0]\n" "fmla z18.h, z6.h, z2.h[0]\n" @@ -1482,21 +1452,21 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z15.h, z7.h, z1.h[0]\n" "fmla z19.h, z7.h, z2.h[0]\n" "fmla z23.h, z7.h, z3.h[0]\n" - "ble 54f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 50f\n" + "ld1h 
{ z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[1]\n" "fmla z16.h, z6.h, z2.h[1]\n" "fmla z20.h, z6.h, z3.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[1]\n" "fmla z13.h, z7.h, z1.h[1]\n" "fmla z17.h, z7.h, z2.h[1]\n" "fmla z21.h, z7.h, z3.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[1]\n" "fmla z14.h, z6.h, z1.h[1]\n" "fmla z18.h, z6.h, z2.h[1]\n" @@ -1505,21 +1475,21 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z15.h, z7.h, z1.h[1]\n" "fmla z19.h, z7.h, z2.h[1]\n" "fmla z23.h, z7.h, z3.h[1]\n" - "ble 54f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 50f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[2]\n" "fmla z16.h, z6.h, z2.h[2]\n" "fmla z20.h, z6.h, z3.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[2]\n" "fmla z13.h, z7.h, z1.h[2]\n" "fmla z17.h, z7.h, z2.h[2]\n" "fmla z21.h, z7.h, z3.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[2]\n" "fmla z14.h, z6.h, z1.h[2]\n" "fmla z18.h, z6.h, z2.h[2]\n" @@ -1528,21 +1498,21 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z15.h, z7.h, z1.h[2]\n" "fmla z19.h, z7.h, z2.h[2]\n" "fmla z23.h, z7.h, z3.h[2]\n" - "ble 54f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 50f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, 
#0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[3]\n" "fmla z16.h, z6.h, z2.h[3]\n" "fmla z20.h, z6.h, z3.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[3]\n" "fmla z13.h, z7.h, z1.h[3]\n" "fmla z17.h, z7.h, z2.h[3]\n" "fmla z21.h, z7.h, z3.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[3]\n" "fmla z14.h, z6.h, z1.h[3]\n" "fmla z18.h, z6.h, z2.h[3]\n" @@ -1551,21 +1521,21 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z15.h, z7.h, z1.h[3]\n" "fmla z19.h, z7.h, z2.h[3]\n" "fmla z23.h, z7.h, z3.h[3]\n" - "ble 54f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 50f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[4]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[4]\n" "fmla z16.h, z6.h, z2.h[4]\n" "fmla z20.h, z6.h, z3.h[4]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[4]\n" "fmla z13.h, z7.h, z1.h[4]\n" "fmla z17.h, z7.h, z2.h[4]\n" "fmla z21.h, z7.h, z3.h[4]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[4]\n" "fmla z14.h, z6.h, z1.h[4]\n" "fmla z18.h, z6.h, z2.h[4]\n" @@ -1574,21 +1544,21 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z15.h, z7.h, z1.h[4]\n" "fmla z19.h, z7.h, z2.h[4]\n" "fmla z23.h, z7.h, z3.h[4]\n" - "ble 54f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 50f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[5]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[5]\n" "fmla 
z16.h, z6.h, z2.h[5]\n" "fmla z20.h, z6.h, z3.h[5]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[5]\n" "fmla z13.h, z7.h, z1.h[5]\n" "fmla z17.h, z7.h, z2.h[5]\n" "fmla z21.h, z7.h, z3.h[5]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[5]\n" "fmla z14.h, z6.h, z1.h[5]\n" "fmla z18.h, z6.h, z2.h[5]\n" @@ -1597,21 +1567,21 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z15.h, z7.h, z1.h[5]\n" "fmla z19.h, z7.h, z2.h[5]\n" "fmla z23.h, z7.h, z3.h[5]\n" - "ble 54f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 50f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[6]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[6]\n" "fmla z16.h, z6.h, z2.h[6]\n" "fmla z20.h, z6.h, z3.h[6]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[6]\n" "fmla z13.h, z7.h, z1.h[6]\n" "fmla z17.h, z7.h, z2.h[6]\n" "fmla z21.h, z7.h, z3.h[6]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[6]\n" "fmla z14.h, z6.h, z1.h[6]\n" "fmla z18.h, z6.h, z2.h[6]\n" @@ -1620,20 +1590,20 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z15.h, z7.h, z1.h[6]\n" "fmla z19.h, z7.h, z2.h[6]\n" "fmla z23.h, z7.h, z3.h[6]\n" - "ble 54f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 50f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[7]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" "fmla z12.h, z6.h, z1.h[7]\n" "fmla z16.h, z6.h, z2.h[7]\n" "fmla z20.h, z6.h, z3.h[7]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, 
z0.h[7]\n" "fmla z13.h, z7.h, z1.h[7]\n" "fmla z17.h, z7.h, z2.h[7]\n" "fmla z21.h, z7.h, z3.h[7]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[7]\n" "fmla z14.h, z6.h, z1.h[7]\n" "fmla z18.h, z6.h, z2.h[7]\n" @@ -1642,16 +1612,20 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z15.h, z7.h, z1.h[7]\n" "fmla z19.h, z7.h, z2.h[7]\n" "fmla z23.h, z7.h, z3.h[7]\n" - "54:" // Height 4: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "50:" // Height 4: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 49b\n" - "tbz %x[flags], #1, 55f\n" + "cmp x27, x19\n" + "bne 45b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "tbz %x[flags], #1, 51f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1rh { z1.h }, p5/Z, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -1688,73 +1662,52 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmax z21.h, p5/M, z21.h, z1.h\n" "fmax z22.h, p5/M, z22.h, z1.h\n" "fmax z23.h, p5/M, z23.h, z1.h\n" - "55:" // Height 4: No activation - "st1h { z8.h }, p4, [x13]\n" - "st1h { z9.h }, p3, [x13, #1, MUL VL]\n" - "st1h { z10.h }, p2, [x13, #2, MUL VL]\n" - "st1h { z11.h }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "st1h { z12.h }, p4, [x9]\n" - "st1h { z13.h }, p3, [x9, #1, MUL VL]\n" - "st1h { z14.h }, p2, [x9, #2, MUL VL]\n" - "st1h { z15.h }, p1, [x9, #3, MUL VL]\n" - "addvl x9, x9, #4\n" - "st1h { z16.h }, p4, [x27]\n" - "st1h { z17.h }, p3, [x27, #1, MUL VL]\n" - "st1h { z18.h }, p2, [x27, #2, MUL VL]\n" 
- "st1h { z19.h }, p1, [x27, #3, MUL VL]\n" - "addvl x27, x27, #4\n" - "st1h { z20.h }, p4, [x25]\n" - "st1h { z21.h }, p3, [x25, #1, MUL VL]\n" - "st1h { z22.h }, p2, [x25, #2, MUL VL]\n" - "st1h { z23.h }, p1, [x25, #3, MUL VL]\n" - "addvl x25, x25, #4\n" - "56:" // Height 4: Writeback done - "dech x16, ALL, MUL #4\n" - "cmp x16, XZR\n" - "bgt 45b\n" - "b 86f\n" - "57:" // Height 5 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 58f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #1\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #1\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "ldr x23, [%x[output_ptr], #0x20]\n" - "add x27, x27, x19, LSL #1\n" - "add x25, x25, x19, LSL #1\n" - "add x23, x23, x19, LSL #1\n" - "b 59f\n" - "58:" // Height 5: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #1\n" - "add x27, x9, x19, LSL #1\n" - "add x25, x27, x19, LSL #1\n" - "add x23, x25, x19, LSL #1\n" - "59:" // Height 5: Column loop + "51:" // Height 4: No activation + "st1h { z8.h }, p4, [x28]\n" + "st1h { z9.h }, p3, [x28, #1, MUL VL]\n" + "st1h { z10.h }, p2, [x28, #2, MUL VL]\n" + "st1h { z11.h }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1h { z12.h }, p4, [x24]\n" + "st1h { z13.h }, p3, [x24, #1, MUL VL]\n" + "st1h { z14.h }, p2, [x24, #2, MUL VL]\n" + "st1h { z15.h }, p1, [x24, #3, MUL VL]\n" + "st1h { z16.h }, p4, [x23]\n" + "st1h { z17.h }, p3, [x23, #1, MUL VL]\n" + "st1h { z18.h }, p2, [x23, #2, MUL VL]\n" + "st1h { z19.h }, p1, [x23, #3, MUL VL]\n" + "st1h { z20.h }, p4, [x22]\n" + "st1h { z21.h }, p3, [x22, #1, MUL VL]\n" + "st1h { z22.h }, p2, [x22, #2, MUL VL]\n" + "st1h { z23.h }, p1, [x22, #3, MUL VL]\n" + "52:" // Height 4: Writeback done + "dech x11, ALL, MUL #4\n" + "cmp x11, XZR\n" + "bgt 41b\n" + "b 
80f\n" + "53:" // Height 5 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "54:" // Height 5: Column loop "mov x19, #0x0\n" - "whilelt p4.h, x19, x16\n" + "whilelt p4.h, x19, x11\n" "inch x19\n" - "whilelt p3.h, x19, x16\n" + "whilelt p3.h, x19, x11\n" "inch x19\n" - "whilelt p2.h, x19, x16\n" + "whilelt p2.h, x19, x11\n" "inch x19\n" - "whilelt p1.h, x19, x16\n" - "cbz x14, 60f\n" - "ld1h { z8.h }, p5/Z, [x14]\n" + "whilelt p1.h, x19, x11\n" + "cbz x9, 55f\n" + "ld1h { z8.h }, p5/Z, [x9]\n" "mov z12.d, z8.d\n" - "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n" + "ld1h { z9.h }, p5/Z, [x9, #1, MUL VL]\n" "mov z16.d, z8.d\n" - "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n" + "ld1h { z10.h }, p5/Z, [x9, #2, MUL VL]\n" "mov z20.d, z8.d\n" - "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1h { z11.h }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "mov z13.d, z9.d\n" "mov z17.d, z9.d\n" "mov z14.d, z10.d\n" @@ -1768,31 +1721,36 @@ void sve_hybrid_fp16_mla_6x4VL ( "mov z25.d, z9.d\n" "mov z26.d, z10.d\n" "mov z27.d, z11.d\n" - "b 62f\n" - "60:" // Height 5: no bias - "tbz %x[flags], #0, 61f\n" - "ld1h { z8.h }, p4/Z, [x13]\n" - "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n" - "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n" - "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n" - "ld1h { z12.h }, p4/Z, [x9]\n" - "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n" - "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n" - "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n" - "ld1h { z16.h }, p4/Z, [x27]\n" - "ld1h { z17.h }, p3/Z, [x27, #1, MUL VL]\n" - "ld1h { z18.h }, p2/Z, [x27, #2, MUL VL]\n" - "ld1h { z19.h }, p1/Z, [x27, #3, MUL VL]\n" - "ld1h { z20.h }, p4/Z, [x25]\n" - "ld1h { z21.h }, p3/Z, [x25, #1, MUL VL]\n" - "ld1h { z22.h }, p2/Z, [x25, #2, MUL VL]\n" - "ld1h { z23.h }, p1/Z, [x25, #3, MUL VL]\n" - "ld1h { z24.h }, p4/Z, [x23]\n" - "ld1h { z25.h }, p3/Z, [x23, #1, MUL VL]\n" - "ld1h { 
z26.h }, p2/Z, [x23, #2, MUL VL]\n" - "ld1h { z27.h }, p1/Z, [x23, #3, MUL VL]\n" - "b 62f\n" - "61:" // Height 5: no accumulate + "b 57f\n" + "55:" // Height 5: no bias + "tbz %x[flags], #0, 56f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1h { z8.h }, p4/Z, [x28]\n" + "add x24, x28, x19, LSL #1\n" + "ld1h { z9.h }, p3/Z, [x28, #1, MUL VL]\n" + "ld1h { z10.h }, p2/Z, [x28, #2, MUL VL]\n" + "add x23, x24, x19, LSL #1\n" + "ld1h { z11.h }, p1/Z, [x28, #3, MUL VL]\n" + "add x22, x23, x19, LSL #1\n" + "ld1h { z12.h }, p4/Z, [x24]\n" + "add x21, x22, x19, LSL #1\n" + "ld1h { z13.h }, p3/Z, [x24, #1, MUL VL]\n" + "ld1h { z14.h }, p2/Z, [x24, #2, MUL VL]\n" + "ld1h { z15.h }, p1/Z, [x24, #3, MUL VL]\n" + "ld1h { z16.h }, p4/Z, [x23]\n" + "ld1h { z17.h }, p3/Z, [x23, #1, MUL VL]\n" + "ld1h { z18.h }, p2/Z, [x23, #2, MUL VL]\n" + "ld1h { z19.h }, p1/Z, [x23, #3, MUL VL]\n" + "ld1h { z20.h }, p4/Z, [x22]\n" + "ld1h { z21.h }, p3/Z, [x22, #1, MUL VL]\n" + "ld1h { z22.h }, p2/Z, [x22, #2, MUL VL]\n" + "ld1h { z23.h }, p1/Z, [x22, #3, MUL VL]\n" + "ld1h { z24.h }, p4/Z, [x21]\n" + "ld1h { z25.h }, p3/Z, [x21, #1, MUL VL]\n" + "ld1h { z26.h }, p2/Z, [x21, #2, MUL VL]\n" + "ld1h { z27.h }, p1/Z, [x21, #3, MUL VL]\n" + "b 57f\n" + "56:" // Height 5: no accumulate "mov z8.b, #0x0\n" "mov z9.b, #0x0\n" "mov z10.b, #0x0\n" @@ -1813,240 +1771,240 @@ void sve_hybrid_fp16_mla_6x4VL ( "mov z25.b, #0x0\n" "mov z26.b, #0x0\n" "mov z27.b, #0x0\n" - "62:" // Height 5: setup done - "mov x12, #0x0\n" - "63:" // Height 5: String loop + "57:" // Height 5: setup done + "mov x27, #0x0\n" + "58:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 64f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 59f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - 
"ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "ldr x22, [x20, #0x20]\n" - "cbnz x12, 65f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "cbnz x27, 60f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #1\n" - "add x28, x28, x19, LSL #1\n" - "add x26, x26, x19, LSL #1\n" + "add x25, x25, x19, LSL #1\n" "add x24, x24, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" "add x22, x22, x19, LSL #1\n" - "b 65f\n" - "64:" // Height 5: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #1\n" - "add x26, x28, x19, LSL #1\n" - "add x24, x26, x19, LSL #1\n" - "add x22, x24, x19, LSL #1\n" - "65:" // Height 5: input setup done - "cmp x11, #0x8\n" - "ble 67f\n" - "66:" // Height 5: Multiply loop: Main loop head - "ld1h { z6.h }, p5/Z, [x15]\n" - "whilelt p0.h, XZR, x11\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "sub x11, x11, #0x8\n" - "ld1rqh { z0.h }, p0/Z, [x10]\n" + "add x21, x21, x19, LSL #1\n" + "b 60f\n" + "59:" // Height 5: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "add x21, x22, x19, LSL #1\n" + "60:" // Height 5: input setup done + "cmp x26, #0x8\n" + "ble 62f\n" + "61:" // Height 5: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "sub x26, x26, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" "fmla z8.h, z6.h, z0.h[0]\n" - "ld1rqh { z1.h }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqh { z1.h }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "fmla z9.h, z7.h, z0.h[0]\n" - "ld1rqh { z2.h }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqh { z2.h }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "fmla z12.h, z6.h, z1.h[0]\n" - "ld1rqh { z3.h }, p0/Z, [x24]\n" - "add 
x26, x26, #0x10\n" + "ld1rqh { z3.h }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "fmla z16.h, z6.h, z2.h[0]\n" - "ld1rqh { z4.h }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" - "fmla z13.h, z7.h, z1.h[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" + "ld1rqh { z4.h }, p0/Z, [x21]\n" "add x22, x22, #0x10\n" + "fmla z13.h, z7.h, z1.h[0]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x21, x21, #0x10\n" "fmla z20.h, z6.h, z3.h[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "cmp x11, #0x8\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x26, #0x8\n" "fmla z24.h, z6.h, z4.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z17.h, z7.h, z2.h[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "fmla z21.h, z7.h, z3.h[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" "fmla z25.h, z7.h, z4.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" "fmla z10.h, z6.h, z0.h[0]\n" - "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" "fmla z14.h, z6.h, z1.h[0]\n" "fmla z18.h, z6.h, z2.h[0]\n" "fmla z22.h, z6.h, z3.h[0]\n" "fmla z26.h, z6.h, z4.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" "fmla z11.h, z7.h, z0.h[0]\n" "fmla z15.h, z7.h, z1.h[0]\n" "fmla z19.h, z7.h, z2.h[0]\n" "fmla z23.h, z7.h, z3.h[0]\n" "fmla z27.h, z7.h, z4.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" "fmla z8.h, z6.h, z0.h[1]\n" "fmla z12.h, z6.h, z1.h[1]\n" "fmla z16.h, z6.h, z2.h[1]\n" "fmla z20.h, z6.h, z3.h[1]\n" "fmla z24.h, z6.h, z4.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" "fmla z9.h, z7.h, z0.h[1]\n" "fmla z13.h, z7.h, z1.h[1]\n" "fmla z17.h, z7.h, z2.h[1]\n" "fmla z21.h, z7.h, z3.h[1]\n" "fmla z25.h, z7.h, z4.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" - "addvl x15, x15, #16\n" + "ld1h { z7.h }, 
p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" "fmla z10.h, z6.h, z0.h[1]\n" "fmla z14.h, z6.h, z1.h[1]\n" "fmla z18.h, z6.h, z2.h[1]\n" "fmla z22.h, z6.h, z3.h[1]\n" "fmla z26.h, z6.h, z4.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" "fmla z11.h, z7.h, z0.h[1]\n" "fmla z15.h, z7.h, z1.h[1]\n" "fmla z19.h, z7.h, z2.h[1]\n" "fmla z23.h, z7.h, z3.h[1]\n" "fmla z27.h, z7.h, z4.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" "fmla z8.h, z6.h, z0.h[2]\n" "fmla z12.h, z6.h, z1.h[2]\n" "fmla z16.h, z6.h, z2.h[2]\n" "fmla z20.h, z6.h, z3.h[2]\n" "fmla z24.h, z6.h, z4.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" "fmla z9.h, z7.h, z0.h[2]\n" "fmla z13.h, z7.h, z1.h[2]\n" "fmla z17.h, z7.h, z2.h[2]\n" "fmla z21.h, z7.h, z3.h[2]\n" "fmla z25.h, z7.h, z4.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" "fmla z10.h, z6.h, z0.h[2]\n" "fmla z14.h, z6.h, z1.h[2]\n" "fmla z18.h, z6.h, z2.h[2]\n" "fmla z22.h, z6.h, z3.h[2]\n" "fmla z26.h, z6.h, z4.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" "fmla z11.h, z7.h, z0.h[2]\n" "fmla z15.h, z7.h, z1.h[2]\n" "fmla z19.h, z7.h, z2.h[2]\n" "fmla z23.h, z7.h, z3.h[2]\n" "fmla z27.h, z7.h, z4.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" "fmla z8.h, z6.h, z0.h[3]\n" "fmla z12.h, z6.h, z1.h[3]\n" "fmla z16.h, z6.h, z2.h[3]\n" "fmla z20.h, z6.h, z3.h[3]\n" "fmla z24.h, z6.h, z4.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[3]\n" "fmla z13.h, z7.h, z1.h[3]\n" "fmla z17.h, z7.h, z2.h[3]\n" "fmla z21.h, z7.h, z3.h[3]\n" "fmla z25.h, z7.h, z4.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" "fmla z10.h, 
z6.h, z0.h[3]\n" "fmla z14.h, z6.h, z1.h[3]\n" "fmla z18.h, z6.h, z2.h[3]\n" "fmla z22.h, z6.h, z3.h[3]\n" "fmla z26.h, z6.h, z4.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z11.h, z7.h, z0.h[3]\n" "fmla z15.h, z7.h, z1.h[3]\n" "fmla z19.h, z7.h, z2.h[3]\n" "fmla z23.h, z7.h, z3.h[3]\n" "fmla z27.h, z7.h, z4.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" "fmla z8.h, z6.h, z0.h[4]\n" "fmla z12.h, z6.h, z1.h[4]\n" "fmla z16.h, z6.h, z2.h[4]\n" "fmla z20.h, z6.h, z3.h[4]\n" "fmla z24.h, z6.h, z4.h[4]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[4]\n" "fmla z13.h, z7.h, z1.h[4]\n" "fmla z17.h, z7.h, z2.h[4]\n" "fmla z21.h, z7.h, z3.h[4]\n" "fmla z25.h, z7.h, z4.h[4]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" "fmla z10.h, z6.h, z0.h[4]\n" "fmla z14.h, z6.h, z1.h[4]\n" "fmla z18.h, z6.h, z2.h[4]\n" "fmla z22.h, z6.h, z3.h[4]\n" "fmla z26.h, z6.h, z4.h[4]\n" - "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" "fmla z11.h, z7.h, z0.h[4]\n" "fmla z15.h, z7.h, z1.h[4]\n" "fmla z19.h, z7.h, z2.h[4]\n" "fmla z23.h, z7.h, z3.h[4]\n" "fmla z27.h, z7.h, z4.h[4]\n" - "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" "fmla z8.h, z6.h, z0.h[5]\n" "fmla z12.h, z6.h, z1.h[5]\n" "fmla z16.h, z6.h, z2.h[5]\n" "fmla z20.h, z6.h, z3.h[5]\n" "fmla z24.h, z6.h, z4.h[5]\n" - "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" "fmla z9.h, z7.h, z0.h[5]\n" "fmla z13.h, z7.h, z1.h[5]\n" "fmla z17.h, z7.h, z2.h[5]\n" "fmla z21.h, z7.h, z3.h[5]\n" "fmla z25.h, z7.h, z4.h[5]\n" - "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" - "addvl x15, x15, #16\n" + "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" "fmla z10.h, z6.h, z0.h[5]\n" "fmla z14.h, z6.h, z1.h[5]\n" "fmla 
z18.h, z6.h, z2.h[5]\n" "fmla z22.h, z6.h, z3.h[5]\n" "fmla z26.h, z6.h, z4.h[5]\n" - "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" "fmla z11.h, z7.h, z0.h[5]\n" "fmla z15.h, z7.h, z1.h[5]\n" "fmla z19.h, z7.h, z2.h[5]\n" "fmla z23.h, z7.h, z3.h[5]\n" "fmla z27.h, z7.h, z4.h[5]\n" - "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" "fmla z8.h, z6.h, z0.h[6]\n" "fmla z12.h, z6.h, z1.h[6]\n" "fmla z16.h, z6.h, z2.h[6]\n" "fmla z20.h, z6.h, z3.h[6]\n" "fmla z24.h, z6.h, z4.h[6]\n" - "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" "fmla z9.h, z7.h, z0.h[6]\n" "fmla z13.h, z7.h, z1.h[6]\n" "fmla z17.h, z7.h, z2.h[6]\n" "fmla z21.h, z7.h, z3.h[6]\n" "fmla z25.h, z7.h, z4.h[6]\n" - "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" "fmla z10.h, z6.h, z0.h[6]\n" "fmla z14.h, z6.h, z1.h[6]\n" "fmla z18.h, z6.h, z2.h[6]\n" "fmla z22.h, z6.h, z3.h[6]\n" "fmla z26.h, z6.h, z4.h[6]\n" - "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" "fmla z11.h, z7.h, z0.h[6]\n" "fmla z15.h, z7.h, z1.h[6]\n" "fmla z19.h, z7.h, z2.h[6]\n" "fmla z23.h, z7.h, z3.h[6]\n" "fmla z27.h, z7.h, z4.h[6]\n" - "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" "fmla z8.h, z6.h, z0.h[7]\n" "fmla z12.h, z6.h, z1.h[7]\n" "fmla z16.h, z6.h, z2.h[7]\n" "fmla z20.h, z6.h, z3.h[7]\n" "fmla z24.h, z6.h, z4.h[7]\n" - "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[7]\n" "fmla z13.h, z7.h, z1.h[7]\n" "fmla z17.h, z7.h, z2.h[7]\n" "fmla z21.h, z7.h, z3.h[7]\n" "fmla z25.h, z7.h, z4.h[7]\n" - "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" "fmla z10.h, z6.h, z0.h[7]\n" "fmla z14.h, z6.h, z1.h[7]\n" "fmla z18.h, z6.h, z2.h[7]\n" @@ -2057,35 +2015,35 @@ void 
sve_hybrid_fp16_mla_6x4VL ( "fmla z19.h, z7.h, z2.h[7]\n" "fmla z23.h, z7.h, z3.h[7]\n" "fmla z27.h, z7.h, z4.h[7]\n" - "bgt 66b\n" - "67:" // Height 5: Multiply loop: Single iteration only - "ld1h { z6.h }, p5/Z, [x15]\n" - "whilelt p0.h, XZR, x11\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" - "ld1rqh { z0.h }, p0/Z, [x10]\n" + "bgt 61b\n" + "62:" // Height 5: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" "fmla z8.h, z6.h, z0.h[0]\n" - "ld1rqh { z1.h }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqh { z1.h }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "fmla z9.h, z7.h, z0.h[0]\n" - "ld1rqh { z2.h }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqh { z2.h }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "fmla z12.h, z6.h, z1.h[0]\n" - "ld1rqh { z3.h }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqh { z3.h }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "fmla z16.h, z6.h, z2.h[0]\n" - "ld1rqh { z4.h }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" - "fmla z13.h, z7.h, z1.h[0]\n" + "ld1rqh { z4.h }, p0/Z, [x21]\n" "add x22, x22, #0x10\n" + "fmla z13.h, z7.h, z1.h[0]\n" + "add x21, x21, #0x10\n" "fmla z17.h, z7.h, z2.h[0]\n" "fmla z20.h, z6.h, z3.h[0]\n" "fmla z24.h, z6.h, z4.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z21.h, z7.h, z3.h[0]\n" "fmla z25.h, z7.h, z4.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[0]\n" "fmla z14.h, z6.h, z1.h[0]\n" "fmla z18.h, z6.h, z2.h[0]\n" @@ -2096,23 +2054,23 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z19.h, z7.h, z2.h[0]\n" "fmla z23.h, z7.h, z3.h[0]\n" "fmla z27.h, z7.h, z4.h[0]\n" - "ble 68f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 63f\n" + "ld1h { z6.h }, p5/Z, 
[x10]\n" "fmla z8.h, z6.h, z0.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[1]\n" "fmla z16.h, z6.h, z2.h[1]\n" "fmla z20.h, z6.h, z3.h[1]\n" "fmla z24.h, z6.h, z4.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[1]\n" "fmla z13.h, z7.h, z1.h[1]\n" "fmla z17.h, z7.h, z2.h[1]\n" "fmla z21.h, z7.h, z3.h[1]\n" "fmla z25.h, z7.h, z4.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[1]\n" "fmla z14.h, z6.h, z1.h[1]\n" "fmla z18.h, z6.h, z2.h[1]\n" @@ -2123,23 +2081,23 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z19.h, z7.h, z2.h[1]\n" "fmla z23.h, z7.h, z3.h[1]\n" "fmla z27.h, z7.h, z4.h[1]\n" - "ble 68f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 63f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[2]\n" "fmla z16.h, z6.h, z2.h[2]\n" "fmla z20.h, z6.h, z3.h[2]\n" "fmla z24.h, z6.h, z4.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[2]\n" "fmla z13.h, z7.h, z1.h[2]\n" "fmla z17.h, z7.h, z2.h[2]\n" "fmla z21.h, z7.h, z3.h[2]\n" "fmla z25.h, z7.h, z4.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[2]\n" "fmla z14.h, z6.h, z1.h[2]\n" "fmla z18.h, z6.h, z2.h[2]\n" @@ -2150,23 +2108,23 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z19.h, z7.h, z2.h[2]\n" "fmla z23.h, z7.h, z3.h[2]\n" "fmla z27.h, z7.h, z4.h[2]\n" - "ble 68f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 63f\n" + "ld1h { z6.h }, 
p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[3]\n" "fmla z16.h, z6.h, z2.h[3]\n" "fmla z20.h, z6.h, z3.h[3]\n" "fmla z24.h, z6.h, z4.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[3]\n" "fmla z13.h, z7.h, z1.h[3]\n" "fmla z17.h, z7.h, z2.h[3]\n" "fmla z21.h, z7.h, z3.h[3]\n" "fmla z25.h, z7.h, z4.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[3]\n" "fmla z14.h, z6.h, z1.h[3]\n" "fmla z18.h, z6.h, z2.h[3]\n" @@ -2177,23 +2135,23 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z19.h, z7.h, z2.h[3]\n" "fmla z23.h, z7.h, z3.h[3]\n" "fmla z27.h, z7.h, z4.h[3]\n" - "ble 68f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 63f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[4]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[4]\n" "fmla z16.h, z6.h, z2.h[4]\n" "fmla z20.h, z6.h, z3.h[4]\n" "fmla z24.h, z6.h, z4.h[4]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[4]\n" "fmla z13.h, z7.h, z1.h[4]\n" "fmla z17.h, z7.h, z2.h[4]\n" "fmla z21.h, z7.h, z3.h[4]\n" "fmla z25.h, z7.h, z4.h[4]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[4]\n" "fmla z14.h, z6.h, z1.h[4]\n" "fmla z18.h, z6.h, z2.h[4]\n" @@ -2204,23 +2162,23 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z19.h, z7.h, z2.h[4]\n" "fmla z23.h, z7.h, z3.h[4]\n" "fmla z27.h, z7.h, z4.h[4]\n" - "ble 68f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 63f\n" + "ld1h { z6.h 
}, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[5]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[5]\n" "fmla z16.h, z6.h, z2.h[5]\n" "fmla z20.h, z6.h, z3.h[5]\n" "fmla z24.h, z6.h, z4.h[5]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[5]\n" "fmla z13.h, z7.h, z1.h[5]\n" "fmla z17.h, z7.h, z2.h[5]\n" "fmla z21.h, z7.h, z3.h[5]\n" "fmla z25.h, z7.h, z4.h[5]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[5]\n" "fmla z14.h, z6.h, z1.h[5]\n" "fmla z18.h, z6.h, z2.h[5]\n" @@ -2231,23 +2189,23 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z19.h, z7.h, z2.h[5]\n" "fmla z23.h, z7.h, z3.h[5]\n" "fmla z27.h, z7.h, z4.h[5]\n" - "ble 68f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 63f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[6]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[6]\n" "fmla z16.h, z6.h, z2.h[6]\n" "fmla z20.h, z6.h, z3.h[6]\n" "fmla z24.h, z6.h, z4.h[6]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[6]\n" "fmla z13.h, z7.h, z1.h[6]\n" "fmla z17.h, z7.h, z2.h[6]\n" "fmla z21.h, z7.h, z3.h[6]\n" "fmla z25.h, z7.h, z4.h[6]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[6]\n" "fmla z14.h, z6.h, z1.h[6]\n" "fmla z18.h, z6.h, z2.h[6]\n" @@ -2258,22 +2216,22 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z19.h, z7.h, z2.h[6]\n" "fmla z23.h, z7.h, z3.h[6]\n" "fmla z27.h, z7.h, z4.h[6]\n" - "ble 68f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 63f\n" + "ld1h { 
z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[7]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" "fmla z12.h, z6.h, z1.h[7]\n" "fmla z16.h, z6.h, z2.h[7]\n" "fmla z20.h, z6.h, z3.h[7]\n" "fmla z24.h, z6.h, z4.h[7]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[7]\n" "fmla z13.h, z7.h, z1.h[7]\n" "fmla z17.h, z7.h, z2.h[7]\n" "fmla z21.h, z7.h, z3.h[7]\n" "fmla z25.h, z7.h, z4.h[7]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[7]\n" "fmla z14.h, z6.h, z1.h[7]\n" "fmla z18.h, z6.h, z2.h[7]\n" @@ -2284,17 +2242,22 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z19.h, z7.h, z2.h[7]\n" "fmla z23.h, z7.h, z3.h[7]\n" "fmla z27.h, z7.h, z4.h[7]\n" - "68:" // Height 5: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "63:" // Height 5: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 63b\n" - "tbz %x[flags], #1, 69f\n" + "cmp x27, x19\n" + "bne 58b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "add x21, x22, x19, LSL #1\n" + "tbz %x[flags], #1, 64f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1rh { z1.h }, p5/Z, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -2339,83 +2302,59 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmax z25.h, p5/M, z25.h, z1.h\n" "fmax z26.h, p5/M, z26.h, z1.h\n" "fmax z27.h, p5/M, z27.h, z1.h\n" - "69:" // Height 5: No activation - "st1h { z8.h }, p4, 
[x13]\n" - "st1h { z9.h }, p3, [x13, #1, MUL VL]\n" - "st1h { z10.h }, p2, [x13, #2, MUL VL]\n" - "st1h { z11.h }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "st1h { z12.h }, p4, [x9]\n" - "st1h { z13.h }, p3, [x9, #1, MUL VL]\n" - "st1h { z14.h }, p2, [x9, #2, MUL VL]\n" - "st1h { z15.h }, p1, [x9, #3, MUL VL]\n" - "addvl x9, x9, #4\n" - "st1h { z16.h }, p4, [x27]\n" - "st1h { z17.h }, p3, [x27, #1, MUL VL]\n" - "st1h { z18.h }, p2, [x27, #2, MUL VL]\n" - "st1h { z19.h }, p1, [x27, #3, MUL VL]\n" - "addvl x27, x27, #4\n" - "st1h { z20.h }, p4, [x25]\n" - "st1h { z21.h }, p3, [x25, #1, MUL VL]\n" - "st1h { z22.h }, p2, [x25, #2, MUL VL]\n" - "st1h { z23.h }, p1, [x25, #3, MUL VL]\n" - "addvl x25, x25, #4\n" - "st1h { z24.h }, p4, [x23]\n" - "st1h { z25.h }, p3, [x23, #1, MUL VL]\n" - "st1h { z26.h }, p2, [x23, #2, MUL VL]\n" - "st1h { z27.h }, p1, [x23, #3, MUL VL]\n" - "addvl x23, x23, #4\n" - "70:" // Height 5: Writeback done - "dech x16, ALL, MUL #4\n" - "cmp x16, XZR\n" - "bgt 59b\n" - "b 86f\n" - "71:" // Height 6 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 72f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #1\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #1\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "ldr x23, [%x[output_ptr], #0x20]\n" - "add x27, x27, x19, LSL #1\n" - "ldr x21, [%x[output_ptr], #0x28]\n" - "add %x[output_ptr], %x[output_ptr], #0x30\n" - "add x25, x25, x19, LSL #1\n" - "add x23, x23, x19, LSL #1\n" - "add x21, x21, x19, LSL #1\n" - "b 73f\n" - "72:" // Height 6: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #1\n" - "add x27, x9, x19, LSL #1\n" - "add x25, x27, x19, LSL #1\n" - "add x23, x25, x19, LSL #1\n" - "add x21, x23, x19, LSL #1\n" - "add %x[output_ptr], x21, x19, LSL 
#1\n" - "73:" // Height 6: Column loop + "64:" // Height 5: No activation + "st1h { z8.h }, p4, [x28]\n" + "st1h { z9.h }, p3, [x28, #1, MUL VL]\n" + "st1h { z10.h }, p2, [x28, #2, MUL VL]\n" + "st1h { z11.h }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1h { z12.h }, p4, [x24]\n" + "st1h { z13.h }, p3, [x24, #1, MUL VL]\n" + "st1h { z14.h }, p2, [x24, #2, MUL VL]\n" + "st1h { z15.h }, p1, [x24, #3, MUL VL]\n" + "st1h { z16.h }, p4, [x23]\n" + "st1h { z17.h }, p3, [x23, #1, MUL VL]\n" + "st1h { z18.h }, p2, [x23, #2, MUL VL]\n" + "st1h { z19.h }, p1, [x23, #3, MUL VL]\n" + "st1h { z20.h }, p4, [x22]\n" + "st1h { z21.h }, p3, [x22, #1, MUL VL]\n" + "st1h { z22.h }, p2, [x22, #2, MUL VL]\n" + "st1h { z23.h }, p1, [x22, #3, MUL VL]\n" + "st1h { z24.h }, p4, [x21]\n" + "st1h { z25.h }, p3, [x21, #1, MUL VL]\n" + "st1h { z26.h }, p2, [x21, #2, MUL VL]\n" + "st1h { z27.h }, p1, [x21, #3, MUL VL]\n" + "65:" // Height 5: Writeback done + "dech x11, ALL, MUL #4\n" + "cmp x11, XZR\n" + "bgt 54b\n" + "b 80f\n" + "66:" // Height 6 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x19, #0xc\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "67:" // Height 6: Column loop "mov x19, #0x0\n" - "whilelt p4.h, x19, x16\n" + "whilelt p4.h, x19, x11\n" "inch x19\n" - "whilelt p3.h, x19, x16\n" + "whilelt p3.h, x19, x11\n" "inch x19\n" - "whilelt p2.h, x19, x16\n" + "whilelt p2.h, x19, x11\n" "inch x19\n" - "whilelt p1.h, x19, x16\n" - "cbz x14, 74f\n" - "ld1h { z8.h }, p5/Z, [x14]\n" + "whilelt p1.h, x19, x11\n" + "cbz x9, 68f\n" + "ld1h { z8.h }, p5/Z, [x9]\n" "mov z12.d, z8.d\n" - "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n" + "ld1h { z9.h }, p5/Z, [x9, #1, MUL VL]\n" "mov z16.d, z8.d\n" - "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n" + "ld1h { z10.h }, p5/Z, [x9, #2, MUL VL]\n" "mov z20.d, z8.d\n" - "ld1h { 
z11.h }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1h { z11.h }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "mov z13.d, z9.d\n" "mov z17.d, z9.d\n" "mov z14.d, z10.d\n" @@ -2433,35 +2372,41 @@ void sve_hybrid_fp16_mla_6x4VL ( "mov z29.d, z9.d\n" "mov z30.d, z10.d\n" "mov z31.d, z11.d\n" - "b 76f\n" - "74:" // Height 6: no bias - "tbz %x[flags], #0, 75f\n" - "ld1h { z8.h }, p4/Z, [x13]\n" - "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n" - "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n" - "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n" - "ld1h { z12.h }, p4/Z, [x9]\n" - "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n" - "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n" - "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n" - "ld1h { z16.h }, p4/Z, [x27]\n" - "ld1h { z17.h }, p3/Z, [x27, #1, MUL VL]\n" - "ld1h { z18.h }, p2/Z, [x27, #2, MUL VL]\n" - "ld1h { z19.h }, p1/Z, [x27, #3, MUL VL]\n" - "ld1h { z20.h }, p4/Z, [x25]\n" - "ld1h { z21.h }, p3/Z, [x25, #1, MUL VL]\n" - "ld1h { z22.h }, p2/Z, [x25, #2, MUL VL]\n" - "ld1h { z23.h }, p1/Z, [x25, #3, MUL VL]\n" - "ld1h { z24.h }, p4/Z, [x23]\n" - "ld1h { z25.h }, p3/Z, [x23, #1, MUL VL]\n" - "ld1h { z26.h }, p2/Z, [x23, #2, MUL VL]\n" - "ld1h { z27.h }, p1/Z, [x23, #3, MUL VL]\n" - "ld1h { z28.h }, p4/Z, [x21]\n" - "ld1h { z29.h }, p3/Z, [x21, #1, MUL VL]\n" - "ld1h { z30.h }, p2/Z, [x21, #2, MUL VL]\n" - "ld1h { z31.h }, p1/Z, [x21, #3, MUL VL]\n" - "b 76f\n" - "75:" // Height 6: no accumulate + "b 70f\n" + "68:" // Height 6: no bias + "tbz %x[flags], #0, 69f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1h { z8.h }, p4/Z, [x28]\n" + "add x24, x28, x19, LSL #1\n" + "ld1h { z9.h }, p3/Z, [x28, #1, MUL VL]\n" + "ld1h { z10.h }, p2/Z, [x28, #2, MUL VL]\n" + "add x23, x24, x19, LSL #1\n" + "ld1h { z11.h }, p1/Z, [x28, #3, MUL VL]\n" + "add x22, x23, x19, LSL #1\n" + "ld1h { z12.h }, p4/Z, [x24]\n" + "add x21, x22, x19, LSL #1\n" + "ld1h { z13.h }, p3/Z, [x24, #1, MUL VL]\n" + "add x20, x21, x19, LSL #1\n" + "ld1h { 
z14.h }, p2/Z, [x24, #2, MUL VL]\n" + "ld1h { z15.h }, p1/Z, [x24, #3, MUL VL]\n" + "ld1h { z16.h }, p4/Z, [x23]\n" + "ld1h { z17.h }, p3/Z, [x23, #1, MUL VL]\n" + "ld1h { z18.h }, p2/Z, [x23, #2, MUL VL]\n" + "ld1h { z19.h }, p1/Z, [x23, #3, MUL VL]\n" + "ld1h { z20.h }, p4/Z, [x22]\n" + "ld1h { z21.h }, p3/Z, [x22, #1, MUL VL]\n" + "ld1h { z22.h }, p2/Z, [x22, #2, MUL VL]\n" + "ld1h { z23.h }, p1/Z, [x22, #3, MUL VL]\n" + "ld1h { z24.h }, p4/Z, [x21]\n" + "ld1h { z25.h }, p3/Z, [x21, #1, MUL VL]\n" + "ld1h { z26.h }, p2/Z, [x21, #2, MUL VL]\n" + "ld1h { z27.h }, p1/Z, [x21, #3, MUL VL]\n" + "ld1h { z28.h }, p4/Z, [x20]\n" + "ld1h { z29.h }, p3/Z, [x20, #1, MUL VL]\n" + "ld1h { z30.h }, p2/Z, [x20, #2, MUL VL]\n" + "ld1h { z31.h }, p1/Z, [x20, #3, MUL VL]\n" + "b 70f\n" + "69:" // Height 6: no accumulate "mov z8.b, #0x0\n" "mov z9.b, #0x0\n" "mov z10.b, #0x0\n" @@ -2486,77 +2431,77 @@ void sve_hybrid_fp16_mla_6x4VL ( "mov z29.b, #0x0\n" "mov z30.b, #0x0\n" "mov z31.b, #0x0\n" - "76:" // Height 6: setup done - "mov x12, #0x0\n" - "77:" // Height 6: String loop + "70:" // Height 6: setup done + "mov x27, #0x0\n" + "71:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 78f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 72f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "ldr x22, [x20, #0x20]\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" "ldr x20, [x20, #0x28]\n" - "cbnz x12, 79f\n" + "cbnz x27, 73f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #1\n" - "add x28, x28, x19, LSL #1\n" - "add 
x26, x26, x19, LSL #1\n" + "add x25, x25, x19, LSL #1\n" "add x24, x24, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" "add x22, x22, x19, LSL #1\n" + "add x21, x21, x19, LSL #1\n" "add x20, x20, x19, LSL #1\n" - "b 79f\n" - "78:" // Height 6: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #1\n" - "add x26, x28, x19, LSL #1\n" - "add x24, x26, x19, LSL #1\n" - "add x22, x24, x19, LSL #1\n" - "add x20, x22, x19, LSL #1\n" - "79:" // Height 6: input setup done - "cmp x11, #0x8\n" - "ble 81f\n" - "80:" // Height 6: Multiply loop: Main loop head - "ld1h { z6.h }, p5/Z, [x15]\n" - "whilelt p0.h, XZR, x11\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "sub x11, x11, #0x8\n" - "ld1rqh { z0.h }, p0/Z, [x10]\n" + "b 73f\n" + "72:" // Height 6: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "add x21, x22, x19, LSL #1\n" + "add x20, x21, x19, LSL #1\n" + "73:" // Height 6: input setup done + "cmp x26, #0x8\n" + "ble 75f\n" + "74:" // Height 6: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "sub x26, x26, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" "fmla z8.h, z6.h, z0.h[0]\n" - "ld1rqh { z1.h }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqh { z1.h }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "fmla z9.h, z7.h, z0.h[0]\n" - "ld1rqh { z2.h }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqh { z2.h }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "fmla z12.h, z6.h, z1.h[0]\n" - "ld1rqh { z3.h }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqh { z3.h }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "fmla z16.h, z6.h, z2.h[0]\n" - "ld1rqh { z4.h }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" + "ld1rqh { z4.h }, p0/Z, [x21]\n" + "add x22, x22, #0x10\n" "fmla z13.h, z7.h, z1.h[0]\n" "ld1rqh { z5.h }, p0/Z, [x20]\n" - "add x22, x22, #0x10\n" + "add x21, x21, #0x10\n" "fmla 
z20.h, z6.h, z3.h[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" "add x20, x20, #0x10\n" "fmla z24.h, z6.h, z4.h[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "cmp x11, #0x8\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x26, #0x8\n" "fmla z28.h, z6.h, z5.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z17.h, z7.h, z2.h[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "fmla z21.h, z7.h, z3.h[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "fmla z25.h, z7.h, z4.h[0]\n" "prfm pldl1keep, [x22, #0x80]\n" + "fmla z25.h, z7.h, z4.h[0]\n" + "prfm pldl1keep, [x21, #0x80]\n" "fmla z29.h, z7.h, z5.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" "fmla z10.h, z6.h, z0.h[0]\n" "prfm pldl1keep, [x20, #0x80]\n" "fmla z14.h, z6.h, z1.h[0]\n" @@ -2564,198 +2509,198 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z22.h, z6.h, z3.h[0]\n" "fmla z26.h, z6.h, z4.h[0]\n" "fmla z30.h, z6.h, z5.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" "fmla z11.h, z7.h, z0.h[0]\n" "fmla z15.h, z7.h, z1.h[0]\n" "fmla z19.h, z7.h, z2.h[0]\n" "fmla z23.h, z7.h, z3.h[0]\n" "fmla z27.h, z7.h, z4.h[0]\n" "fmla z31.h, z7.h, z5.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" "fmla z8.h, z6.h, z0.h[1]\n" "fmla z12.h, z6.h, z1.h[1]\n" "fmla z16.h, z6.h, z2.h[1]\n" "fmla z20.h, z6.h, z3.h[1]\n" "fmla z24.h, z6.h, z4.h[1]\n" "fmla z28.h, z6.h, z5.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" "fmla z9.h, z7.h, z0.h[1]\n" "fmla z13.h, z7.h, z1.h[1]\n" "fmla z17.h, z7.h, z2.h[1]\n" "fmla z21.h, z7.h, z3.h[1]\n" "fmla z25.h, z7.h, z4.h[1]\n" "fmla z29.h, z7.h, z5.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" - "addvl x15, x15, #16\n" + "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" "fmla z10.h, 
z6.h, z0.h[1]\n" "fmla z14.h, z6.h, z1.h[1]\n" "fmla z18.h, z6.h, z2.h[1]\n" "fmla z22.h, z6.h, z3.h[1]\n" "fmla z26.h, z6.h, z4.h[1]\n" "fmla z30.h, z6.h, z5.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" "fmla z11.h, z7.h, z0.h[1]\n" "fmla z15.h, z7.h, z1.h[1]\n" "fmla z19.h, z7.h, z2.h[1]\n" "fmla z23.h, z7.h, z3.h[1]\n" "fmla z27.h, z7.h, z4.h[1]\n" "fmla z31.h, z7.h, z5.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" "fmla z8.h, z6.h, z0.h[2]\n" "fmla z12.h, z6.h, z1.h[2]\n" "fmla z16.h, z6.h, z2.h[2]\n" "fmla z20.h, z6.h, z3.h[2]\n" "fmla z24.h, z6.h, z4.h[2]\n" "fmla z28.h, z6.h, z5.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" "fmla z9.h, z7.h, z0.h[2]\n" "fmla z13.h, z7.h, z1.h[2]\n" "fmla z17.h, z7.h, z2.h[2]\n" "fmla z21.h, z7.h, z3.h[2]\n" "fmla z25.h, z7.h, z4.h[2]\n" "fmla z29.h, z7.h, z5.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" "fmla z10.h, z6.h, z0.h[2]\n" "fmla z14.h, z6.h, z1.h[2]\n" "fmla z18.h, z6.h, z2.h[2]\n" "fmla z22.h, z6.h, z3.h[2]\n" "fmla z26.h, z6.h, z4.h[2]\n" "fmla z30.h, z6.h, z5.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" "fmla z11.h, z7.h, z0.h[2]\n" "fmla z15.h, z7.h, z1.h[2]\n" "fmla z19.h, z7.h, z2.h[2]\n" "fmla z23.h, z7.h, z3.h[2]\n" "fmla z27.h, z7.h, z4.h[2]\n" "fmla z31.h, z7.h, z5.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" "fmla z8.h, z6.h, z0.h[3]\n" "fmla z12.h, z6.h, z1.h[3]\n" "fmla z16.h, z6.h, z2.h[3]\n" "fmla z20.h, z6.h, z3.h[3]\n" "fmla z24.h, z6.h, z4.h[3]\n" "fmla z28.h, z6.h, z5.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[3]\n" "fmla z13.h, z7.h, z1.h[3]\n" "fmla z17.h, z7.h, z2.h[3]\n" "fmla z21.h, z7.h, 
z3.h[3]\n" "fmla z25.h, z7.h, z4.h[3]\n" "fmla z29.h, z7.h, z5.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" "fmla z10.h, z6.h, z0.h[3]\n" "fmla z14.h, z6.h, z1.h[3]\n" "fmla z18.h, z6.h, z2.h[3]\n" "fmla z22.h, z6.h, z3.h[3]\n" "fmla z26.h, z6.h, z4.h[3]\n" "fmla z30.h, z6.h, z5.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z11.h, z7.h, z0.h[3]\n" "fmla z15.h, z7.h, z1.h[3]\n" "fmla z19.h, z7.h, z2.h[3]\n" "fmla z23.h, z7.h, z3.h[3]\n" "fmla z27.h, z7.h, z4.h[3]\n" "fmla z31.h, z7.h, z5.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" "fmla z8.h, z6.h, z0.h[4]\n" "fmla z12.h, z6.h, z1.h[4]\n" "fmla z16.h, z6.h, z2.h[4]\n" "fmla z20.h, z6.h, z3.h[4]\n" "fmla z24.h, z6.h, z4.h[4]\n" "fmla z28.h, z6.h, z5.h[4]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[4]\n" "fmla z13.h, z7.h, z1.h[4]\n" "fmla z17.h, z7.h, z2.h[4]\n" "fmla z21.h, z7.h, z3.h[4]\n" "fmla z25.h, z7.h, z4.h[4]\n" "fmla z29.h, z7.h, z5.h[4]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" "fmla z10.h, z6.h, z0.h[4]\n" "fmla z14.h, z6.h, z1.h[4]\n" "fmla z18.h, z6.h, z2.h[4]\n" "fmla z22.h, z6.h, z3.h[4]\n" "fmla z26.h, z6.h, z4.h[4]\n" "fmla z30.h, z6.h, z5.h[4]\n" - "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" "fmla z11.h, z7.h, z0.h[4]\n" "fmla z15.h, z7.h, z1.h[4]\n" "fmla z19.h, z7.h, z2.h[4]\n" "fmla z23.h, z7.h, z3.h[4]\n" "fmla z27.h, z7.h, z4.h[4]\n" "fmla z31.h, z7.h, z5.h[4]\n" - "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" "fmla z8.h, z6.h, z0.h[5]\n" "fmla z12.h, z6.h, z1.h[5]\n" "fmla z16.h, z6.h, z2.h[5]\n" "fmla z20.h, z6.h, z3.h[5]\n" "fmla z24.h, z6.h, z4.h[5]\n" "fmla z28.h, z6.h, z5.h[5]\n" - "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "ld1h { z6.h }, 
p5/Z, [x10, #6, MUL VL]\n" "fmla z9.h, z7.h, z0.h[5]\n" "fmla z13.h, z7.h, z1.h[5]\n" "fmla z17.h, z7.h, z2.h[5]\n" "fmla z21.h, z7.h, z3.h[5]\n" "fmla z25.h, z7.h, z4.h[5]\n" "fmla z29.h, z7.h, z5.h[5]\n" - "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" - "addvl x15, x15, #16\n" + "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" "fmla z10.h, z6.h, z0.h[5]\n" "fmla z14.h, z6.h, z1.h[5]\n" "fmla z18.h, z6.h, z2.h[5]\n" "fmla z22.h, z6.h, z3.h[5]\n" "fmla z26.h, z6.h, z4.h[5]\n" "fmla z30.h, z6.h, z5.h[5]\n" - "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" "fmla z11.h, z7.h, z0.h[5]\n" "fmla z15.h, z7.h, z1.h[5]\n" "fmla z19.h, z7.h, z2.h[5]\n" "fmla z23.h, z7.h, z3.h[5]\n" "fmla z27.h, z7.h, z4.h[5]\n" "fmla z31.h, z7.h, z5.h[5]\n" - "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" "fmla z8.h, z6.h, z0.h[6]\n" "fmla z12.h, z6.h, z1.h[6]\n" "fmla z16.h, z6.h, z2.h[6]\n" "fmla z20.h, z6.h, z3.h[6]\n" "fmla z24.h, z6.h, z4.h[6]\n" "fmla z28.h, z6.h, z5.h[6]\n" - "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" "fmla z9.h, z7.h, z0.h[6]\n" "fmla z13.h, z7.h, z1.h[6]\n" "fmla z17.h, z7.h, z2.h[6]\n" "fmla z21.h, z7.h, z3.h[6]\n" "fmla z25.h, z7.h, z4.h[6]\n" "fmla z29.h, z7.h, z5.h[6]\n" - "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" "fmla z10.h, z6.h, z0.h[6]\n" "fmla z14.h, z6.h, z1.h[6]\n" "fmla z18.h, z6.h, z2.h[6]\n" "fmla z22.h, z6.h, z3.h[6]\n" "fmla z26.h, z6.h, z4.h[6]\n" "fmla z30.h, z6.h, z5.h[6]\n" - "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" "fmla z11.h, z7.h, z0.h[6]\n" "fmla z15.h, z7.h, z1.h[6]\n" "fmla z19.h, z7.h, z2.h[6]\n" "fmla z23.h, z7.h, z3.h[6]\n" "fmla z27.h, z7.h, z4.h[6]\n" "fmla z31.h, z7.h, z5.h[6]\n" - "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" "fmla z8.h, z6.h, 
z0.h[7]\n" "fmla z12.h, z6.h, z1.h[7]\n" "fmla z16.h, z6.h, z2.h[7]\n" "fmla z20.h, z6.h, z3.h[7]\n" "fmla z24.h, z6.h, z4.h[7]\n" "fmla z28.h, z6.h, z5.h[7]\n" - "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[7]\n" "fmla z13.h, z7.h, z1.h[7]\n" "fmla z17.h, z7.h, z2.h[7]\n" "fmla z21.h, z7.h, z3.h[7]\n" "fmla z25.h, z7.h, z4.h[7]\n" "fmla z29.h, z7.h, z5.h[7]\n" - "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" "fmla z10.h, z6.h, z0.h[7]\n" "fmla z14.h, z6.h, z1.h[7]\n" "fmla z18.h, z6.h, z2.h[7]\n" @@ -2768,39 +2713,39 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z23.h, z7.h, z3.h[7]\n" "fmla z27.h, z7.h, z4.h[7]\n" "fmla z31.h, z7.h, z5.h[7]\n" - "bgt 80b\n" - "81:" // Height 6: Multiply loop: Single iteration only - "ld1h { z6.h }, p5/Z, [x15]\n" - "whilelt p0.h, XZR, x11\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" - "ld1rqh { z0.h }, p0/Z, [x10]\n" + "bgt 74b\n" + "75:" // Height 6: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x10]\n" + "whilelt p0.h, XZR, x26\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" "fmla z8.h, z6.h, z0.h[0]\n" - "ld1rqh { z1.h }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqh { z1.h }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "fmla z9.h, z7.h, z0.h[0]\n" - "ld1rqh { z2.h }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqh { z2.h }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "fmla z12.h, z6.h, z1.h[0]\n" - "ld1rqh { z3.h }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqh { z3.h }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "fmla z16.h, z6.h, z2.h[0]\n" - "ld1rqh { z4.h }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" + "ld1rqh { z4.h }, p0/Z, [x21]\n" + "add x22, x22, #0x10\n" "fmla z13.h, z7.h, z1.h[0]\n" "ld1rqh { z5.h }, p0/Z, [x20]\n" - "add x22, x22, #0x10\n" + "add x21, x21, #0x10\n" "fmla z20.h, z6.h, z3.h[0]\n" "add 
x20, x20, #0x10\n" "fmla z17.h, z7.h, z2.h[0]\n" "fmla z24.h, z6.h, z4.h[0]\n" "fmla z28.h, z6.h, z5.h[0]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z21.h, z7.h, z3.h[0]\n" "fmla z25.h, z7.h, z4.h[0]\n" "fmla z29.h, z7.h, z5.h[0]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[0]\n" "fmla z14.h, z6.h, z1.h[0]\n" "fmla z18.h, z6.h, z2.h[0]\n" @@ -2813,25 +2758,25 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z23.h, z7.h, z3.h[0]\n" "fmla z27.h, z7.h, z4.h[0]\n" "fmla z31.h, z7.h, z5.h[0]\n" - "ble 82f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 76f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[1]\n" "fmla z16.h, z6.h, z2.h[1]\n" "fmla z20.h, z6.h, z3.h[1]\n" "fmla z24.h, z6.h, z4.h[1]\n" "fmla z28.h, z6.h, z5.h[1]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[1]\n" "fmla z13.h, z7.h, z1.h[1]\n" "fmla z17.h, z7.h, z2.h[1]\n" "fmla z21.h, z7.h, z3.h[1]\n" "fmla z25.h, z7.h, z4.h[1]\n" "fmla z29.h, z7.h, z5.h[1]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[1]\n" "fmla z14.h, z6.h, z1.h[1]\n" "fmla z18.h, z6.h, z2.h[1]\n" @@ -2844,25 +2789,25 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z23.h, z7.h, z3.h[1]\n" "fmla z27.h, z7.h, z4.h[1]\n" "fmla z31.h, z7.h, z5.h[1]\n" - "ble 82f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 76f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla 
z12.h, z6.h, z1.h[2]\n" "fmla z16.h, z6.h, z2.h[2]\n" "fmla z20.h, z6.h, z3.h[2]\n" "fmla z24.h, z6.h, z4.h[2]\n" "fmla z28.h, z6.h, z5.h[2]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[2]\n" "fmla z13.h, z7.h, z1.h[2]\n" "fmla z17.h, z7.h, z2.h[2]\n" "fmla z21.h, z7.h, z3.h[2]\n" "fmla z25.h, z7.h, z4.h[2]\n" "fmla z29.h, z7.h, z5.h[2]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[2]\n" "fmla z14.h, z6.h, z1.h[2]\n" "fmla z18.h, z6.h, z2.h[2]\n" @@ -2875,25 +2820,25 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z23.h, z7.h, z3.h[2]\n" "fmla z27.h, z7.h, z4.h[2]\n" "fmla z31.h, z7.h, z5.h[2]\n" - "ble 82f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 76f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[3]\n" "fmla z16.h, z6.h, z2.h[3]\n" "fmla z20.h, z6.h, z3.h[3]\n" "fmla z24.h, z6.h, z4.h[3]\n" "fmla z28.h, z6.h, z5.h[3]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[3]\n" "fmla z13.h, z7.h, z1.h[3]\n" "fmla z17.h, z7.h, z2.h[3]\n" "fmla z21.h, z7.h, z3.h[3]\n" "fmla z25.h, z7.h, z4.h[3]\n" "fmla z29.h, z7.h, z5.h[3]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[3]\n" "fmla z14.h, z6.h, z1.h[3]\n" "fmla z18.h, z6.h, z2.h[3]\n" @@ -2906,25 +2851,25 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z23.h, z7.h, z3.h[3]\n" "fmla z27.h, z7.h, z4.h[3]\n" "fmla z31.h, z7.h, z5.h[3]\n" - "ble 82f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 76f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[4]\n" - "ld1h { z7.h }, p5/Z, 
[x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[4]\n" "fmla z16.h, z6.h, z2.h[4]\n" "fmla z20.h, z6.h, z3.h[4]\n" "fmla z24.h, z6.h, z4.h[4]\n" "fmla z28.h, z6.h, z5.h[4]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[4]\n" "fmla z13.h, z7.h, z1.h[4]\n" "fmla z17.h, z7.h, z2.h[4]\n" "fmla z21.h, z7.h, z3.h[4]\n" "fmla z25.h, z7.h, z4.h[4]\n" "fmla z29.h, z7.h, z5.h[4]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[4]\n" "fmla z14.h, z6.h, z1.h[4]\n" "fmla z18.h, z6.h, z2.h[4]\n" @@ -2937,25 +2882,25 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z23.h, z7.h, z3.h[4]\n" "fmla z27.h, z7.h, z4.h[4]\n" "fmla z31.h, z7.h, z5.h[4]\n" - "ble 82f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 76f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[5]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[5]\n" "fmla z16.h, z6.h, z2.h[5]\n" "fmla z20.h, z6.h, z3.h[5]\n" "fmla z24.h, z6.h, z4.h[5]\n" "fmla z28.h, z6.h, z5.h[5]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[5]\n" "fmla z13.h, z7.h, z1.h[5]\n" "fmla z17.h, z7.h, z2.h[5]\n" "fmla z21.h, z7.h, z3.h[5]\n" "fmla z25.h, z7.h, z4.h[5]\n" "fmla z29.h, z7.h, z5.h[5]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[5]\n" "fmla z14.h, z6.h, z1.h[5]\n" "fmla z18.h, z6.h, z2.h[5]\n" @@ -2968,25 +2913,25 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z23.h, z7.h, z3.h[5]\n" "fmla z27.h, z7.h, z4.h[5]\n" "fmla z31.h, z7.h, z5.h[5]\n" - "ble 82f\n" - "ld1h { 
z6.h }, p5/Z, [x15]\n" + "ble 76f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[6]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.h, z6.h, z1.h[6]\n" "fmla z16.h, z6.h, z2.h[6]\n" "fmla z20.h, z6.h, z3.h[6]\n" "fmla z24.h, z6.h, z4.h[6]\n" "fmla z28.h, z6.h, z5.h[6]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[6]\n" "fmla z13.h, z7.h, z1.h[6]\n" "fmla z17.h, z7.h, z2.h[6]\n" "fmla z21.h, z7.h, z3.h[6]\n" "fmla z25.h, z7.h, z4.h[6]\n" "fmla z29.h, z7.h, z5.h[6]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[6]\n" "fmla z14.h, z6.h, z1.h[6]\n" "fmla z18.h, z6.h, z2.h[6]\n" @@ -2999,24 +2944,24 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z23.h, z7.h, z3.h[6]\n" "fmla z27.h, z7.h, z4.h[6]\n" "fmla z31.h, z7.h, z5.h[6]\n" - "ble 82f\n" - "ld1h { z6.h }, p5/Z, [x15]\n" + "ble 76f\n" + "ld1h { z6.h }, p5/Z, [x10]\n" "fmla z8.h, z6.h, z0.h[7]\n" - "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" "fmla z12.h, z6.h, z1.h[7]\n" "fmla z16.h, z6.h, z2.h[7]\n" "fmla z20.h, z6.h, z3.h[7]\n" "fmla z24.h, z6.h, z4.h[7]\n" "fmla z28.h, z6.h, z5.h[7]\n" - "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.h, z7.h, z0.h[7]\n" "fmla z13.h, z7.h, z1.h[7]\n" "fmla z17.h, z7.h, z2.h[7]\n" "fmla z21.h, z7.h, z3.h[7]\n" "fmla z25.h, z7.h, z4.h[7]\n" "fmla z29.h, z7.h, z5.h[7]\n" - "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.h, z6.h, z0.h[7]\n" "fmla z14.h, z6.h, z1.h[7]\n" "fmla z18.h, z6.h, z2.h[7]\n" @@ -3029,18 +2974,24 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmla z23.h, z7.h, z3.h[7]\n" "fmla z27.h, 
z7.h, z4.h[7]\n" "fmla z31.h, z7.h, z5.h[7]\n" - "82:" // Height 6: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "76:" // Height 6: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" "prfm pldl1keep, [x20, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 77b\n" - "tbz %x[flags], #1, 83f\n" + "cmp x27, x19\n" + "bne 71b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #1\n" + "add x23, x24, x19, LSL #1\n" + "add x22, x23, x19, LSL #1\n" + "add x21, x22, x19, LSL #1\n" + "add x20, x21, x19, LSL #1\n" + "tbz %x[flags], #1, 77f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1rh { z1.h }, p5/Z, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -3093,57 +3044,52 @@ void sve_hybrid_fp16_mla_6x4VL ( "fmax z29.h, p5/M, z29.h, z1.h\n" "fmax z30.h, p5/M, z30.h, z1.h\n" "fmax z31.h, p5/M, z31.h, z1.h\n" - "83:" // Height 6: No activation - "st1h { z8.h }, p4, [x13]\n" - "st1h { z9.h }, p3, [x13, #1, MUL VL]\n" - "st1h { z10.h }, p2, [x13, #2, MUL VL]\n" - "st1h { z11.h }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "st1h { z12.h }, p4, [x9]\n" - "st1h { z13.h }, p3, [x9, #1, MUL VL]\n" - "st1h { z14.h }, p2, [x9, #2, MUL VL]\n" - "st1h { z15.h }, p1, [x9, #3, MUL VL]\n" - "addvl x9, x9, #4\n" - "st1h { z16.h }, p4, [x27]\n" - "st1h { z17.h }, p3, [x27, #1, MUL VL]\n" - "st1h { z18.h }, p2, [x27, #2, MUL VL]\n" - "st1h { z19.h }, p1, [x27, #3, MUL VL]\n" - "addvl x27, x27, #4\n" - "st1h { z20.h }, p4, [x25]\n" - "st1h { z21.h }, p3, [x25, #1, MUL VL]\n" - "st1h { z22.h }, p2, [x25, #2, MUL VL]\n" - "st1h { z23.h }, p1, [x25, #3, MUL VL]\n" - "addvl x25, x25, #4\n" - "st1h { z24.h }, p4, [x23]\n" - "st1h { 
z25.h }, p3, [x23, #1, MUL VL]\n" - "st1h { z26.h }, p2, [x23, #2, MUL VL]\n" - "st1h { z27.h }, p1, [x23, #3, MUL VL]\n" - "addvl x23, x23, #4\n" - "st1h { z28.h }, p4, [x21]\n" - "st1h { z29.h }, p3, [x21, #1, MUL VL]\n" - "st1h { z30.h }, p2, [x21, #2, MUL VL]\n" - "st1h { z31.h }, p1, [x21, #3, MUL VL]\n" - "addvl x21, x21, #4\n" - "84:" // Height 6: Writeback done - "dech x16, ALL, MUL #4\n" - "cmp x16, XZR\n" - "bgt 73b\n" + "77:" // Height 6: No activation + "st1h { z8.h }, p4, [x28]\n" + "st1h { z9.h }, p3, [x28, #1, MUL VL]\n" + "st1h { z10.h }, p2, [x28, #2, MUL VL]\n" + "st1h { z11.h }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1h { z12.h }, p4, [x24]\n" + "st1h { z13.h }, p3, [x24, #1, MUL VL]\n" + "st1h { z14.h }, p2, [x24, #2, MUL VL]\n" + "st1h { z15.h }, p1, [x24, #3, MUL VL]\n" + "st1h { z16.h }, p4, [x23]\n" + "st1h { z17.h }, p3, [x23, #1, MUL VL]\n" + "st1h { z18.h }, p2, [x23, #2, MUL VL]\n" + "st1h { z19.h }, p1, [x23, #3, MUL VL]\n" + "st1h { z20.h }, p4, [x22]\n" + "st1h { z21.h }, p3, [x22, #1, MUL VL]\n" + "st1h { z22.h }, p2, [x22, #2, MUL VL]\n" + "st1h { z23.h }, p1, [x22, #3, MUL VL]\n" + "st1h { z24.h }, p4, [x21]\n" + "st1h { z25.h }, p3, [x21, #1, MUL VL]\n" + "st1h { z26.h }, p2, [x21, #2, MUL VL]\n" + "st1h { z27.h }, p1, [x21, #3, MUL VL]\n" + "st1h { z28.h }, p4, [x20]\n" + "st1h { z29.h }, p3, [x20, #1, MUL VL]\n" + "st1h { z30.h }, p2, [x20, #2, MUL VL]\n" + "st1h { z31.h }, p1, [x20, #3, MUL VL]\n" + "78:" // Height 6: Writeback done + "dech x11, ALL, MUL #4\n" + "cmp x11, XZR\n" + "bgt 67b\n" "subs %x[M], %x[M], #0x6\n" - "beq 86f\n" + "beq 80f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 85f\n" + "tbz %x[flags], #3, 79f\n" "add x20, x20, #0x6\n" "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "85:" // Update direct input + "79:" // Update direct input "mov x19, #0xc\n" "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" "b 1b\n" - "86:" // Exit + "80:" // Exit : 
[M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) - : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp index 3c369eb35a..b696e73637 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp @@ -36,7 +36,6 @@ namespace arm_gemm { - // Actual kernel implementations void sve_hybrid_fp32_mla_6x4VL( ARGLIST ); @@ -73,7 +72,6 @@ public: // Default to the generic kernel kern_type kernel=sve_hybrid_fp32_mla_6x4VL; - 
cls_sve_hybrid_fp32_mla_6x4VL(const CPUInfo *) { } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp index db29ebc23c..dee9a107ff 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp @@ -27,6 +27,7 @@ #include "../../utils.hpp" #include +#include namespace arm_gemm { @@ -94,164 +95,158 @@ void sve_hybrid_fp32_mla_6x4VL ( "ptrue p5.b\n" "1:" // Row loop "cmp %x[M], #0x6\n" - "bge 71f\n" + "bge 66f\n" "cmp %x[M], #0x4\n" - "bgt 57f\n" - "beq 43f\n" + "bgt 53f\n" + "beq 40f\n" "cmp %x[M], #0x2\n" - "bgt 29f\n" - "beq 15f\n" - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "mov x14, %x[bias]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 2f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "b 3f\n" - "2:" // Height 1: setup direct output - "mov x13, %x[output_ptr]\n" - "3:" // Height 1: Column loop + "bgt 27f\n" + "beq 14f\n" + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x9, %x[bias]\n" + "mov x28, %x[output_ptr]\n" + "2:" // Height 1: Column loop "mov x19, #0x0\n" - "whilelt p4.s, x19, x16\n" + "whilelt p4.s, x19, x11\n" "incw x19\n" - "whilelt p3.s, x19, x16\n" + "whilelt p3.s, x19, x11\n" "incw x19\n" - "whilelt p2.s, x19, x16\n" + "whilelt p2.s, x19, x11\n" "incw x19\n" - "whilelt p1.s, x19, x16\n" - "cbz x14, 4f\n" - "ld1w { z8.s }, p5/Z, [x14]\n" - "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" - "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" - "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" - "b 6f\n" - "4:" // Height 1: no bias - "tbz %x[flags], #0, 5f\n" - "ld1w { z8.s }, p4/Z, [x13]\n" - "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z10.s }, 
p2/Z, [x13, #2, MUL VL]\n" - "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" - "b 6f\n" - "5:" // Height 1: no accumulate + "whilelt p1.s, x19, x11\n" + "cbz x9, 3f\n" + "ld1w { z8.s }, p5/Z, [x9]\n" + "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n" + "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n" + "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "b 5f\n" + "3:" // Height 1: no bias + "tbz %x[flags], #0, 4f\n" + "ld1w { z8.s }, p4/Z, [x28]\n" + "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n" + "b 5f\n" + "4:" // Height 1: no accumulate "mov z8.b, #0x0\n" "mov z9.b, #0x0\n" "mov z10.b, #0x0\n" "mov z11.b, #0x0\n" - "6:" // Height 1: setup done - "mov x12, #0x0\n" - "7:" // Height 1: String loop + "5:" // Height 1: setup done + "mov x27, #0x0\n" + "6:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 8f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 7f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "cbnz x12, 9f\n" + "ldr x25, [x20, #0x0]\n" + "cbnz x27, 8f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #2\n" - "b 9f\n" - "8:" // Height 1: setup direct input - "mov x10, %x[input_ptr]\n" - "9:" // Height 1: input setup done - "cmp x11, #0x4\n" - "ble 11f\n" - "10:" // Height 1: Multiply loop: Main loop head - "ld1w { z6.s }, p5/Z, [x15]\n" - "whilelt p0.s, XZR, x11\n" - "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" - "sub x11, x11, #0x4\n" - "ld1rqw { z0.s }, p0/Z, [x10]\n" + "add x25, x25, x19, LSL #2\n" + "b 8f\n" + "7:" // Height 1: setup direct input + "mov x25, %x[input_ptr]\n" + "8:" // Height 1: input setup done + "cmp x26, #0x4\n" + "ble 10f\n" + "9:" // Height 1: 
Multiply loop: Main loop head + "ld1w { z6.s }, p5/Z, [x10]\n" + "whilelt p0.s, XZR, x26\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" + "sub x26, x26, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x25]\n" "fmla z8.s, z6.s, z0.s[0]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" - "add x10, x10, #0x10\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" + "add x25, x25, #0x10\n" "fmla z9.s, z7.s, z0.s[0]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" - "cmp x11, #0x4\n" + "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "cmp x26, #0x4\n" "fmla z10.s, z6.s, z0.s[0]\n" - "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n" - "prfm pldl1keep, [x10, #0x80]\n" + "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n" + "prfm pldl1keep, [x25, #0x80]\n" "fmla z11.s, z7.s, z0.s[0]\n" - "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n" "fmla z8.s, z6.s, z0.s[1]\n" - "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n" "fmla z9.s, z7.s, z0.s[1]\n" - "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n" - "addvl x15, x15, #16\n" + "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" "fmla z10.s, z6.s, z0.s[1]\n" - "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n" "fmla z11.s, z7.s, z0.s[1]\n" - "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n" "fmla z8.s, z6.s, z0.s[2]\n" - "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n" "fmla z9.s, z7.s, z0.s[2]\n" - "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n" "fmla z10.s, z6.s, z0.s[2]\n" - "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n" "fmla z11.s, z7.s, z0.s[2]\n" - "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n" "fmla z8.s, z6.s, z0.s[3]\n" - "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n" "fmla z9.s, z7.s, 
z0.s[3]\n" - "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n" "fmla z10.s, z6.s, z0.s[3]\n" "fmla z11.s, z7.s, z0.s[3]\n" - "bgt 10b\n" - "11:" // Height 1: Multiply loop: Single iteration only - "ld1w { z6.s }, p5/Z, [x15]\n" - "whilelt p0.s, XZR, x11\n" - "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" - "ld1rqw { z0.s }, p0/Z, [x10]\n" + "bgt 9b\n" + "10:" // Height 1: Multiply loop: Single iteration only + "ld1w { z6.s }, p5/Z, [x10]\n" + "whilelt p0.s, XZR, x26\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x25]\n" "fmla z8.s, z6.s, z0.s[0]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" - "add x10, x10, #0x10\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" + "add x25, x25, #0x10\n" "fmla z9.s, z7.s, z0.s[0]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.s, z6.s, z0.s[0]\n" "fmla z11.s, z7.s, z0.s[0]\n" - "ble 12f\n" - "ld1w { z6.s }, p5/Z, [x15]\n" + "ble 11f\n" + "ld1w { z6.s }, p5/Z, [x10]\n" "fmla z8.s, z6.s, z0.s[1]\n" - "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z9.s, z7.s, z0.s[1]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" "fmla z10.s, z6.s, z0.s[1]\n" - "addvl x15, x15, #4\n" + "addvl x10, x10, #4\n" "fmla z11.s, z7.s, z0.s[1]\n" - "ble 12f\n" - "ld1w { z6.s }, p5/Z, [x15]\n" + "ble 11f\n" + "ld1w { z6.s }, p5/Z, [x10]\n" "fmla z8.s, z6.s, z0.s[2]\n" - "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z9.s, z7.s, z0.s[2]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, 
MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" "fmla z10.s, z6.s, z0.s[2]\n" - "addvl x15, x15, #4\n" + "addvl x10, x10, #4\n" "fmla z11.s, z7.s, z0.s[2]\n" - "ble 12f\n" - "ld1w { z6.s }, p5/Z, [x15]\n" + "ble 11f\n" + "ld1w { z6.s }, p5/Z, [x10]\n" "fmla z8.s, z6.s, z0.s[3]\n" - "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.s, z7.s, z0.s[3]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.s, z6.s, z0.s[3]\n" "fmla z11.s, z7.s, z0.s[3]\n" - "12:" // Height 1: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" + "11:" // Height 1: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 7b\n" - "tbz %x[flags], #1, 13f\n" + "cmp x27, x19\n" + "bne 6b\n" + "tbz %x[flags], #1, 12f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1rw { z1.s }, p5/Z, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -264,63 +259,56 @@ void sve_hybrid_fp32_mla_6x4VL ( "fmax z9.s, p5/M, z9.s, z1.s\n" "fmax z10.s, p5/M, z10.s, z1.s\n" "fmax z11.s, p5/M, z11.s, z1.s\n" - "13:" // Height 1: No activation - "st1w { z8.s }, p4, [x13]\n" - "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" - "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" - "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "14:" // Height 1: Writeback done - "decw x16, ALL, MUL #4\n" - "cmp x16, XZR\n" - "bgt 3b\n" - "b 86f\n" - "15:" // Height 2 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 16f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" 
- "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "add x9, x9, x19, LSL #2\n" - "b 17f\n" - "16:" // Height 2: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "17:" // Height 2: Column loop + "12:" // Height 1: No activation + "st1w { z8.s }, p4, [x28]\n" + "st1w { z9.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "13:" // Height 1: Writeback done + "decw x11, ALL, MUL #4\n" + "cmp x11, XZR\n" + "bgt 2b\n" + "b 80f\n" + "14:" // Height 2 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "15:" // Height 2: Column loop "mov x19, #0x0\n" - "whilelt p4.s, x19, x16\n" + "whilelt p4.s, x19, x11\n" "incw x19\n" - "whilelt p3.s, x19, x16\n" + "whilelt p3.s, x19, x11\n" "incw x19\n" - "whilelt p2.s, x19, x16\n" + "whilelt p2.s, x19, x11\n" "incw x19\n" - "whilelt p1.s, x19, x16\n" - "cbz x14, 18f\n" - "ld1w { z8.s }, p5/Z, [x14]\n" + "whilelt p1.s, x19, x11\n" + "cbz x9, 16f\n" + "ld1w { z8.s }, p5/Z, [x9]\n" "mov z12.d, z8.d\n" - "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" - "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n" + "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n" "mov z13.d, z9.d\n" - "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "mov z14.d, z10.d\n" "mov z15.d, z11.d\n" - "b 20f\n" - "18:" // Height 2: no bias - "tbz %x[flags], #0, 19f\n" - "ld1w { z8.s }, p4/Z, [x13]\n" - "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" - "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x9]\n" - "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" - "b 20f\n" - "19:" // 
Height 2: no accumulate + "b 18f\n" + "16:" // Height 2: no bias + "tbz %x[flags], #0, 17f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z8.s }, p4/Z, [x28]\n" + "add x24, x28, x19, LSL #2\n" + "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x24]\n" + "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" + "b 18f\n" + "17:" // Height 2: no accumulate "mov z8.b, #0x0\n" "mov z9.b, #0x0\n" "mov z10.b, #0x0\n" @@ -329,160 +317,162 @@ void sve_hybrid_fp32_mla_6x4VL ( "mov z13.b, #0x0\n" "mov z14.b, #0x0\n" "mov z15.b, #0x0\n" - "20:" // Height 2: setup done - "mov x12, #0x0\n" - "21:" // Height 2: String loop + "18:" // Height 2: setup done + "mov x27, #0x0\n" + "19:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 22f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 20f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "cbnz x12, 23f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x27, 21f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #2\n" - "add x28, x28, x19, LSL #2\n" - "b 23f\n" - "22:" // Height 2: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #2\n" - "23:" // Height 2: input setup done - "cmp x11, #0x4\n" - "ble 25f\n" - "24:" // Height 2: Multiply loop: Main loop head - "ld1w { z6.s }, p5/Z, [x15]\n" - "whilelt p0.s, XZR, x11\n" - "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" - "sub x11, x11, #0x4\n" - "ld1rqw { z0.s }, p0/Z, [x10]\n" + "add x25, x25, x19, LSL #2\n" + "add x24, 
x24, x19, LSL #2\n" + "b 21f\n" + "20:" // Height 2: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #2\n" + "21:" // Height 2: input setup done + "cmp x26, #0x4\n" + "ble 23f\n" + "22:" // Height 2: Multiply loop: Main loop head + "ld1w { z6.s }, p5/Z, [x10]\n" + "whilelt p0.s, XZR, x26\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" + "sub x26, x26, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x25]\n" "fmla z8.s, z6.s, z0.s[0]\n" - "ld1rqw { z1.s }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqw { z1.s }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "fmla z9.s, z7.s, z0.s[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" "fmla z12.s, z6.s, z1.s[0]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" - "cmp x11, #0x4\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" + "cmp x26, #0x4\n" "fmla z13.s, z7.s, z1.s[0]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" - "prfm pldl1keep, [x28, #0x80]\n" + "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "prfm pldl1keep, [x24, #0x80]\n" "fmla z10.s, z6.s, z0.s[0]\n" "fmla z14.s, z6.s, z1.s[0]\n" - "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n" "fmla z11.s, z7.s, z0.s[0]\n" "fmla z15.s, z7.s, z1.s[0]\n" - "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n" "fmla z8.s, z6.s, z0.s[1]\n" "fmla z12.s, z6.s, z1.s[1]\n" - "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n" "fmla z9.s, z7.s, z0.s[1]\n" "fmla z13.s, z7.s, z1.s[1]\n" - "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n" - "addvl x15, x15, #16\n" + "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" "fmla z10.s, z6.s, z0.s[1]\n" "fmla z14.s, z6.s, z1.s[1]\n" - "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n" "fmla z11.s, z7.s, z0.s[1]\n" "fmla z15.s, z7.s, z1.s[1]\n" - "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n" + "ld1w { z7.s 
}, p5/Z, [x10, #-7, MUL VL]\n" "fmla z8.s, z6.s, z0.s[2]\n" "fmla z12.s, z6.s, z1.s[2]\n" - "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n" "fmla z9.s, z7.s, z0.s[2]\n" "fmla z13.s, z7.s, z1.s[2]\n" - "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n" "fmla z10.s, z6.s, z0.s[2]\n" "fmla z14.s, z6.s, z1.s[2]\n" - "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n" "fmla z11.s, z7.s, z0.s[2]\n" "fmla z15.s, z7.s, z1.s[2]\n" - "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n" "fmla z8.s, z6.s, z0.s[3]\n" "fmla z12.s, z6.s, z1.s[3]\n" - "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n" "fmla z9.s, z7.s, z0.s[3]\n" "fmla z13.s, z7.s, z1.s[3]\n" - "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n" "fmla z10.s, z6.s, z0.s[3]\n" "fmla z14.s, z6.s, z1.s[3]\n" "fmla z11.s, z7.s, z0.s[3]\n" "fmla z15.s, z7.s, z1.s[3]\n" - "bgt 24b\n" - "25:" // Height 2: Multiply loop: Single iteration only - "ld1w { z6.s }, p5/Z, [x15]\n" - "whilelt p0.s, XZR, x11\n" - "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" - "ld1rqw { z0.s }, p0/Z, [x10]\n" + "bgt 22b\n" + "23:" // Height 2: Multiply loop: Single iteration only + "ld1w { z6.s }, p5/Z, [x10]\n" + "whilelt p0.s, XZR, x26\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x25]\n" "fmla z8.s, z6.s, z0.s[0]\n" - "ld1rqw { z1.s }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqw { z1.s }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "fmla z9.s, z7.s, z0.s[0]\n" - "add x28, x28, #0x10\n" + "add x24, x24, #0x10\n" "fmla z12.s, z6.s, z1.s[0]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" "fmla z13.s, z7.s, z1.s[0]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1w 
{ z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.s, z6.s, z0.s[0]\n" "fmla z14.s, z6.s, z1.s[0]\n" "fmla z11.s, z7.s, z0.s[0]\n" "fmla z15.s, z7.s, z1.s[0]\n" - "ble 26f\n" - "ld1w { z6.s }, p5/Z, [x15]\n" + "ble 24f\n" + "ld1w { z6.s }, p5/Z, [x10]\n" "fmla z8.s, z6.s, z0.s[1]\n" - "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.s, z6.s, z1.s[1]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.s, z7.s, z0.s[1]\n" "fmla z13.s, z7.s, z1.s[1]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.s, z6.s, z0.s[1]\n" "fmla z14.s, z6.s, z1.s[1]\n" "fmla z11.s, z7.s, z0.s[1]\n" "fmla z15.s, z7.s, z1.s[1]\n" - "ble 26f\n" - "ld1w { z6.s }, p5/Z, [x15]\n" + "ble 24f\n" + "ld1w { z6.s }, p5/Z, [x10]\n" "fmla z8.s, z6.s, z0.s[2]\n" - "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.s, z6.s, z1.s[2]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.s, z7.s, z0.s[2]\n" "fmla z13.s, z7.s, z1.s[2]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.s, z6.s, z0.s[2]\n" "fmla z14.s, z6.s, z1.s[2]\n" "fmla z11.s, z7.s, z0.s[2]\n" "fmla z15.s, z7.s, z1.s[2]\n" - "ble 26f\n" - "ld1w { z6.s }, p5/Z, [x15]\n" + "ble 24f\n" + "ld1w { z6.s }, p5/Z, [x10]\n" "fmla z8.s, z6.s, z0.s[3]\n" - "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" "fmla z12.s, z6.s, z1.s[3]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.s, z7.s, z0.s[3]\n" "fmla z13.s, z7.s, z1.s[3]\n" - "ld1w { z7.s }, 
p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.s, z6.s, z0.s[3]\n" "fmla z14.s, z6.s, z1.s[3]\n" "fmla z11.s, z7.s, z0.s[3]\n" "fmla z15.s, z7.s, z1.s[3]\n" - "26:" // Height 2: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" + "24:" // Height 2: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" + "prfm pldl1keep, [x24, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 21b\n" - "tbz %x[flags], #1, 27f\n" + "cmp x27, x19\n" + "bne 19b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "tbz %x[flags], #1, 25f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1rw { z1.s }, p5/Z, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -503,79 +493,69 @@ void sve_hybrid_fp32_mla_6x4VL ( "fmax z13.s, p5/M, z13.s, z1.s\n" "fmax z14.s, p5/M, z14.s, z1.s\n" "fmax z15.s, p5/M, z15.s, z1.s\n" - "27:" // Height 2: No activation - "st1w { z8.s }, p4, [x13]\n" - "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" - "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" - "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "st1w { z12.s }, p4, [x9]\n" - "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" - "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" - "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" - "addvl x9, x9, #4\n" - "28:" // Height 2: Writeback done - "decw x16, ALL, MUL #4\n" - "cmp x16, XZR\n" - "bgt 17b\n" - "b 86f\n" - "29:" // Height 3 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 30f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "add x27, x27, 
x19, LSL #2\n" - "b 31f\n" - "30:" // Height 3: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "31:" // Height 3: Column loop + "25:" // Height 2: No activation + "st1w { z8.s }, p4, [x28]\n" + "st1w { z9.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p4, [x24]\n" + "st1w { z13.s }, p3, [x24, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x24, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x24, #3, MUL VL]\n" + "26:" // Height 2: Writeback done + "decw x11, ALL, MUL #4\n" + "cmp x11, XZR\n" + "bgt 15b\n" + "b 80f\n" + "27:" // Height 3 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "28:" // Height 3: Column loop "mov x19, #0x0\n" - "whilelt p4.s, x19, x16\n" + "whilelt p4.s, x19, x11\n" "incw x19\n" - "whilelt p3.s, x19, x16\n" + "whilelt p3.s, x19, x11\n" "incw x19\n" - "whilelt p2.s, x19, x16\n" + "whilelt p2.s, x19, x11\n" "incw x19\n" - "whilelt p1.s, x19, x16\n" - "cbz x14, 32f\n" - "ld1w { z8.s }, p5/Z, [x14]\n" + "whilelt p1.s, x19, x11\n" + "cbz x9, 29f\n" + "ld1w { z8.s }, p5/Z, [x9]\n" "mov z12.d, z8.d\n" - "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n" "mov z16.d, z8.d\n" - "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" - "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" + "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n" + "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n" "mov z13.d, z9.d\n" - "addvl x14, x14, #4\n" + "addvl x9, x9, #4\n" "mov z17.d, z9.d\n" "mov z14.d, z10.d\n" "mov z15.d, z11.d\n" "mov z18.d, z10.d\n" "mov z19.d, z11.d\n" - "b 34f\n" - "32:" // Height 3: no bias - "tbz %x[flags], #0, 33f\n" - "ld1w { z8.s }, p4/Z, [x13]\n" - "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" - "ld1w { z11.s }, p1/Z, [x13, #3, MUL 
VL]\n" - "ld1w { z12.s }, p4/Z, [x9]\n" - "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x27]\n" - "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" - "b 34f\n" - "33:" // Height 3: no accumulate + "b 31f\n" + "29:" // Height 3: no bias + "tbz %x[flags], #0, 30f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z8.s }, p4/Z, [x28]\n" + "add x24, x28, x19, LSL #2\n" + "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n" + "add x23, x24, x19, LSL #2\n" + "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x24]\n" + "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x23]\n" + "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n" + "b 31f\n" + "30:" // Height 3: no accumulate "mov z8.b, #0x0\n" "mov z9.b, #0x0\n" "mov z10.b, #0x0\n" @@ -588,201 +568,204 @@ void sve_hybrid_fp32_mla_6x4VL ( "mov z17.b, #0x0\n" "mov z18.b, #0x0\n" "mov z19.b, #0x0\n" - "34:" // Height 3: setup done - "mov x12, #0x0\n" - "35:" // Height 3: String loop + "31:" // Height 3: setup done + "mov x27, #0x0\n" + "32:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 36f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 33f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "cbnz x12, 37f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, 
#0x8]\n" + "ldr x23, [x20, #0x10]\n" + "cbnz x27, 34f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #2\n" - "add x28, x28, x19, LSL #2\n" - "add x26, x26, x19, LSL #2\n" - "b 37f\n" - "36:" // Height 3: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #2\n" - "add x26, x28, x19, LSL #2\n" - "37:" // Height 3: input setup done - "cmp x11, #0x4\n" - "ble 39f\n" - "38:" // Height 3: Multiply loop: Main loop head - "ld1w { z6.s }, p5/Z, [x15]\n" - "whilelt p0.s, XZR, x11\n" - "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" - "sub x11, x11, #0x4\n" - "ld1rqw { z0.s }, p0/Z, [x10]\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "b 34f\n" + "33:" // Height 3: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "34:" // Height 3: input setup done + "cmp x26, #0x4\n" + "ble 36f\n" + "35:" // Height 3: Multiply loop: Main loop head + "ld1w { z6.s }, p5/Z, [x10]\n" + "whilelt p0.s, XZR, x26\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" + "sub x26, x26, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x25]\n" "fmla z8.s, z6.s, z0.s[0]\n" - "ld1rqw { z1.s }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqw { z1.s }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "fmla z9.s, z7.s, z0.s[0]\n" - "ld1rqw { z2.s }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqw { z2.s }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "fmla z12.s, z6.s, z1.s[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x23, x23, #0x10\n" "fmla z16.s, z6.s, z2.s[0]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" - "cmp x11, #0x4\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" + "cmp x26, #0x4\n" "fmla z13.s, z7.s, z1.s[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" "fmla z17.s, z7.s, z2.s[0]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" - "prfm pldl1keep, [x26, 
#0x80]\n" + "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "prfm pldl1keep, [x23, #0x80]\n" "fmla z10.s, z6.s, z0.s[0]\n" "fmla z14.s, z6.s, z1.s[0]\n" "fmla z18.s, z6.s, z2.s[0]\n" - "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n" "fmla z11.s, z7.s, z0.s[0]\n" "fmla z15.s, z7.s, z1.s[0]\n" "fmla z19.s, z7.s, z2.s[0]\n" - "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n" "fmla z8.s, z6.s, z0.s[1]\n" "fmla z12.s, z6.s, z1.s[1]\n" "fmla z16.s, z6.s, z2.s[1]\n" - "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n" "fmla z9.s, z7.s, z0.s[1]\n" "fmla z13.s, z7.s, z1.s[1]\n" "fmla z17.s, z7.s, z2.s[1]\n" - "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n" - "addvl x15, x15, #16\n" + "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" "fmla z10.s, z6.s, z0.s[1]\n" "fmla z14.s, z6.s, z1.s[1]\n" "fmla z18.s, z6.s, z2.s[1]\n" - "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n" "fmla z11.s, z7.s, z0.s[1]\n" "fmla z15.s, z7.s, z1.s[1]\n" "fmla z19.s, z7.s, z2.s[1]\n" - "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n" "fmla z8.s, z6.s, z0.s[2]\n" "fmla z12.s, z6.s, z1.s[2]\n" "fmla z16.s, z6.s, z2.s[2]\n" - "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n" "fmla z9.s, z7.s, z0.s[2]\n" "fmla z13.s, z7.s, z1.s[2]\n" "fmla z17.s, z7.s, z2.s[2]\n" - "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n" "fmla z10.s, z6.s, z0.s[2]\n" "fmla z14.s, z6.s, z1.s[2]\n" "fmla z18.s, z6.s, z2.s[2]\n" - "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n" "fmla z11.s, z7.s, z0.s[2]\n" "fmla z15.s, z7.s, z1.s[2]\n" "fmla z19.s, z7.s, z2.s[2]\n" - "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n" "fmla z8.s, z6.s, z0.s[3]\n" "fmla z12.s, z6.s, 
z1.s[3]\n" "fmla z16.s, z6.s, z2.s[3]\n" - "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n" "fmla z9.s, z7.s, z0.s[3]\n" "fmla z13.s, z7.s, z1.s[3]\n" "fmla z17.s, z7.s, z2.s[3]\n" - "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n" "fmla z10.s, z6.s, z0.s[3]\n" "fmla z14.s, z6.s, z1.s[3]\n" "fmla z18.s, z6.s, z2.s[3]\n" "fmla z11.s, z7.s, z0.s[3]\n" "fmla z15.s, z7.s, z1.s[3]\n" "fmla z19.s, z7.s, z2.s[3]\n" - "bgt 38b\n" - "39:" // Height 3: Multiply loop: Single iteration only - "ld1w { z6.s }, p5/Z, [x15]\n" - "whilelt p0.s, XZR, x11\n" - "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" - "ld1rqw { z0.s }, p0/Z, [x10]\n" + "bgt 35b\n" + "36:" // Height 3: Multiply loop: Single iteration only + "ld1w { z6.s }, p5/Z, [x10]\n" + "whilelt p0.s, XZR, x26\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x25]\n" "fmla z8.s, z6.s, z0.s[0]\n" - "ld1rqw { z1.s }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqw { z1.s }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "fmla z9.s, z7.s, z0.s[0]\n" - "ld1rqw { z2.s }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqw { z2.s }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "fmla z12.s, z6.s, z1.s[0]\n" - "add x26, x26, #0x10\n" + "add x23, x23, #0x10\n" "fmla z13.s, z7.s, z1.s[0]\n" "fmla z16.s, z6.s, z2.s[0]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" "fmla z17.s, z7.s, z2.s[0]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.s, z6.s, z0.s[0]\n" "fmla z14.s, z6.s, z1.s[0]\n" "fmla z18.s, z6.s, z2.s[0]\n" "fmla z11.s, z7.s, z0.s[0]\n" "fmla z15.s, z7.s, z1.s[0]\n" "fmla z19.s, z7.s, z2.s[0]\n" - "ble 40f\n" - "ld1w { z6.s }, p5/Z, [x15]\n" + "ble 37f\n" + "ld1w { z6.s }, p5/Z, [x10]\n" "fmla z8.s, z6.s, z0.s[1]\n" - "ld1w { z7.s }, 
p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.s, z6.s, z1.s[1]\n" "fmla z16.s, z6.s, z2.s[1]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.s, z7.s, z0.s[1]\n" "fmla z13.s, z7.s, z1.s[1]\n" "fmla z17.s, z7.s, z2.s[1]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.s, z6.s, z0.s[1]\n" "fmla z14.s, z6.s, z1.s[1]\n" "fmla z18.s, z6.s, z2.s[1]\n" "fmla z11.s, z7.s, z0.s[1]\n" "fmla z15.s, z7.s, z1.s[1]\n" "fmla z19.s, z7.s, z2.s[1]\n" - "ble 40f\n" - "ld1w { z6.s }, p5/Z, [x15]\n" + "ble 37f\n" + "ld1w { z6.s }, p5/Z, [x10]\n" "fmla z8.s, z6.s, z0.s[2]\n" - "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.s, z6.s, z1.s[2]\n" "fmla z16.s, z6.s, z2.s[2]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.s, z7.s, z0.s[2]\n" "fmla z13.s, z7.s, z1.s[2]\n" "fmla z17.s, z7.s, z2.s[2]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.s, z6.s, z0.s[2]\n" "fmla z14.s, z6.s, z1.s[2]\n" "fmla z18.s, z6.s, z2.s[2]\n" "fmla z11.s, z7.s, z0.s[2]\n" "fmla z15.s, z7.s, z1.s[2]\n" "fmla z19.s, z7.s, z2.s[2]\n" - "ble 40f\n" - "ld1w { z6.s }, p5/Z, [x15]\n" + "ble 37f\n" + "ld1w { z6.s }, p5/Z, [x10]\n" "fmla z8.s, z6.s, z0.s[3]\n" - "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" "fmla z12.s, z6.s, z1.s[3]\n" "fmla z16.s, z6.s, z2.s[3]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.s, z7.s, z0.s[3]\n" "fmla z13.s, z7.s, z1.s[3]\n" "fmla z17.s, z7.s, z2.s[3]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, MUL 
VL]\n" - "addvl x15, x15, #4\n" + "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.s, z6.s, z0.s[3]\n" "fmla z14.s, z6.s, z1.s[3]\n" "fmla z18.s, z6.s, z2.s[3]\n" "fmla z11.s, z7.s, z0.s[3]\n" "fmla z15.s, z7.s, z1.s[3]\n" "fmla z19.s, z7.s, z2.s[3]\n" - "40:" // Height 3: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "37:" // Height 3: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 35b\n" - "tbz %x[flags], #1, 41f\n" + "cmp x27, x19\n" + "bne 32b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "tbz %x[flags], #1, 38f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1rw { z1.s }, p5/Z, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -811,65 +794,48 @@ void sve_hybrid_fp32_mla_6x4VL ( "fmax z17.s, p5/M, z17.s, z1.s\n" "fmax z18.s, p5/M, z18.s, z1.s\n" "fmax z19.s, p5/M, z19.s, z1.s\n" - "41:" // Height 3: No activation - "st1w { z8.s }, p4, [x13]\n" - "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" - "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" - "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "st1w { z12.s }, p4, [x9]\n" - "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" - "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" - "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" - "addvl x9, x9, #4\n" - "st1w { z16.s }, p4, [x27]\n" - "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" - "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" - "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" - "addvl x27, x27, #4\n" - "42:" // Height 3: Writeback done - "decw x16, ALL, MUL #4\n" - "cmp x16, XZR\n" - "bgt 31b\n" - "b 86f\n" - "43:" // Height 4 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, 
%x[bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 44f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "add x27, x27, x19, LSL #2\n" - "add x25, x25, x19, LSL #2\n" - "b 45f\n" - "44:" // Height 4: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "45:" // Height 4: Column loop + "38:" // Height 3: No activation + "st1w { z8.s }, p4, [x28]\n" + "st1w { z9.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p4, [x24]\n" + "st1w { z13.s }, p3, [x24, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x24, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x24, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x23]\n" + "st1w { z17.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x23, #3, MUL VL]\n" + "39:" // Height 3: Writeback done + "decw x11, ALL, MUL #4\n" + "cmp x11, XZR\n" + "bgt 28b\n" + "b 80f\n" + "40:" // Height 4 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "41:" // Height 4: Column loop "mov x19, #0x0\n" - "whilelt p4.s, x19, x16\n" + "whilelt p4.s, x19, x11\n" "incw x19\n" - "whilelt p3.s, x19, x16\n" + "whilelt p3.s, x19, x11\n" "incw x19\n" - "whilelt p2.s, x19, x16\n" + "whilelt p2.s, x19, x11\n" "incw x19\n" - "whilelt p1.s, x19, x16\n" - "cbz x14, 46f\n" - "ld1w { z8.s }, p5/Z, [x14]\n" + "whilelt p1.s, x19, x11\n" + "cbz x9, 42f\n" + "ld1w { z8.s }, p5/Z, [x9]\n" "mov z12.d, z8.d\n" - "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n" "mov 
z16.d, z8.d\n" - "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n" "mov z20.d, z8.d\n" - "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "mov z13.d, z9.d\n" "mov z17.d, z9.d\n" "mov z14.d, z10.d\n" @@ -879,27 +845,31 @@ void sve_hybrid_fp32_mla_6x4VL ( "mov z21.d, z9.d\n" "mov z22.d, z10.d\n" "mov z23.d, z11.d\n" - "b 48f\n" - "46:" // Height 4: no bias - "tbz %x[flags], #0, 47f\n" - "ld1w { z8.s }, p4/Z, [x13]\n" - "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" - "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x9]\n" - "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x27]\n" - "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x25]\n" - "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" - "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" - "b 48f\n" - "47:" // Height 4: no accumulate + "b 44f\n" + "42:" // Height 4: no bias + "tbz %x[flags], #0, 43f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z8.s }, p4/Z, [x28]\n" + "add x24, x28, x19, LSL #2\n" + "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n" + "add x23, x24, x19, LSL #2\n" + "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z12.s }, p4/Z, [x24]\n" + "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x23]\n" + "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z20.s }, 
p4/Z, [x22]\n" + "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n" + "b 44f\n" + "43:" // Height 4: no accumulate "mov z8.b, #0x0\n" "mov z9.b, #0x0\n" "mov z10.b, #0x0\n" @@ -916,123 +886,123 @@ void sve_hybrid_fp32_mla_6x4VL ( "mov z21.b, #0x0\n" "mov z22.b, #0x0\n" "mov z23.b, #0x0\n" - "48:" // Height 4: setup done - "mov x12, #0x0\n" - "49:" // Height 4: String loop + "44:" // Height 4: setup done + "mov x27, #0x0\n" + "45:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 50f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 46f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "cbnz x12, 51f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "cbnz x27, 47f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #2\n" - "add x28, x28, x19, LSL #2\n" - "add x26, x26, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" "add x24, x24, x19, LSL #2\n" - "b 51f\n" - "50:" // Height 4: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #2\n" - "add x26, x28, x19, LSL #2\n" - "add x24, x26, x19, LSL #2\n" - "51:" // Height 4: input setup done - "cmp x11, #0x4\n" - "ble 53f\n" - "52:" // Height 4: Multiply loop: Main loop head - "ld1w { z6.s }, p5/Z, [x15]\n" - "whilelt p0.s, XZR, x11\n" - "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" - "sub x11, x11, #0x4\n" - "ld1rqw { z0.s }, p0/Z, [x10]\n" + "add x23, x23, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "b 47f\n" + "46:" // Height 4: setup direct input + "mov x25, %x[input_ptr]\n" + 
"add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "47:" // Height 4: input setup done + "cmp x26, #0x4\n" + "ble 49f\n" + "48:" // Height 4: Multiply loop: Main loop head + "ld1w { z6.s }, p5/Z, [x10]\n" + "whilelt p0.s, XZR, x26\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" + "sub x26, x26, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x25]\n" "fmla z8.s, z6.s, z0.s[0]\n" - "ld1rqw { z1.s }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqw { z1.s }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "fmla z9.s, z7.s, z0.s[0]\n" - "ld1rqw { z2.s }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqw { z2.s }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "fmla z12.s, z6.s, z1.s[0]\n" - "ld1rqw { z3.s }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqw { z3.s }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "fmla z16.s, z6.s, z2.s[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x24, x24, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x22, x22, #0x10\n" "fmla z13.s, z7.s, z1.s[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "cmp x11, #0x4\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x26, #0x4\n" "fmla z20.s, z6.s, z3.s[0]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" "fmla z17.s, z7.s, z2.s[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "fmla z21.s, z7.s, z3.s[0]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" - "prfm pldl1keep, [x24, #0x80]\n" + "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "prfm pldl1keep, [x22, #0x80]\n" "fmla z10.s, z6.s, z0.s[0]\n" "fmla z14.s, z6.s, z1.s[0]\n" "fmla z18.s, z6.s, z2.s[0]\n" "fmla z22.s, z6.s, z3.s[0]\n" - "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n" "fmla z11.s, z7.s, z0.s[0]\n" "fmla z15.s, z7.s, z1.s[0]\n" "fmla z19.s, z7.s, z2.s[0]\n" "fmla z23.s, z7.s, z3.s[0]\n" - "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n" "fmla z8.s, z6.s, z0.s[1]\n" "fmla 
z12.s, z6.s, z1.s[1]\n" "fmla z16.s, z6.s, z2.s[1]\n" "fmla z20.s, z6.s, z3.s[1]\n" - "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n" "fmla z9.s, z7.s, z0.s[1]\n" "fmla z13.s, z7.s, z1.s[1]\n" "fmla z17.s, z7.s, z2.s[1]\n" "fmla z21.s, z7.s, z3.s[1]\n" - "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n" - "addvl x15, x15, #16\n" + "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" "fmla z10.s, z6.s, z0.s[1]\n" "fmla z14.s, z6.s, z1.s[1]\n" "fmla z18.s, z6.s, z2.s[1]\n" "fmla z22.s, z6.s, z3.s[1]\n" - "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n" "fmla z11.s, z7.s, z0.s[1]\n" "fmla z15.s, z7.s, z1.s[1]\n" "fmla z19.s, z7.s, z2.s[1]\n" "fmla z23.s, z7.s, z3.s[1]\n" - "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n" "fmla z8.s, z6.s, z0.s[2]\n" "fmla z12.s, z6.s, z1.s[2]\n" "fmla z16.s, z6.s, z2.s[2]\n" "fmla z20.s, z6.s, z3.s[2]\n" - "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n" "fmla z9.s, z7.s, z0.s[2]\n" "fmla z13.s, z7.s, z1.s[2]\n" "fmla z17.s, z7.s, z2.s[2]\n" "fmla z21.s, z7.s, z3.s[2]\n" - "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n" "fmla z10.s, z6.s, z0.s[2]\n" "fmla z14.s, z6.s, z1.s[2]\n" "fmla z18.s, z6.s, z2.s[2]\n" "fmla z22.s, z6.s, z3.s[2]\n" - "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n" "fmla z11.s, z7.s, z0.s[2]\n" "fmla z15.s, z7.s, z1.s[2]\n" "fmla z19.s, z7.s, z2.s[2]\n" "fmla z23.s, z7.s, z3.s[2]\n" - "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n" "fmla z8.s, z6.s, z0.s[3]\n" "fmla z12.s, z6.s, z1.s[3]\n" "fmla z16.s, z6.s, z2.s[3]\n" "fmla z20.s, z6.s, z3.s[3]\n" - "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n" "fmla z9.s, z7.s, z0.s[3]\n" "fmla z13.s, z7.s, z1.s[3]\n" "fmla z17.s, z7.s, 
z2.s[3]\n" "fmla z21.s, z7.s, z3.s[3]\n" - "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n" "fmla z10.s, z6.s, z0.s[3]\n" "fmla z14.s, z6.s, z1.s[3]\n" "fmla z18.s, z6.s, z2.s[3]\n" @@ -1041,31 +1011,31 @@ void sve_hybrid_fp32_mla_6x4VL ( "fmla z15.s, z7.s, z1.s[3]\n" "fmla z19.s, z7.s, z2.s[3]\n" "fmla z23.s, z7.s, z3.s[3]\n" - "bgt 52b\n" - "53:" // Height 4: Multiply loop: Single iteration only - "ld1w { z6.s }, p5/Z, [x15]\n" - "whilelt p0.s, XZR, x11\n" - "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" - "ld1rqw { z0.s }, p0/Z, [x10]\n" + "bgt 48b\n" + "49:" // Height 4: Multiply loop: Single iteration only + "ld1w { z6.s }, p5/Z, [x10]\n" + "whilelt p0.s, XZR, x26\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x25]\n" "fmla z8.s, z6.s, z0.s[0]\n" - "ld1rqw { z1.s }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqw { z1.s }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "fmla z9.s, z7.s, z0.s[0]\n" - "ld1rqw { z2.s }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqw { z2.s }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "fmla z12.s, z6.s, z1.s[0]\n" - "ld1rqw { z3.s }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqw { z3.s }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "fmla z16.s, z6.s, z2.s[0]\n" - "add x24, x24, #0x10\n" + "add x22, x22, #0x10\n" "fmla z13.s, z7.s, z1.s[0]\n" "fmla z17.s, z7.s, z2.s[0]\n" "fmla z20.s, z6.s, z3.s[0]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" "fmla z21.s, z7.s, z3.s[0]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.s, z6.s, z0.s[0]\n" "fmla z14.s, z6.s, z1.s[0]\n" "fmla z18.s, z6.s, z2.s[0]\n" @@ -1074,21 +1044,21 @@ void sve_hybrid_fp32_mla_6x4VL ( "fmla z15.s, z7.s, z1.s[0]\n" "fmla z19.s, z7.s, z2.s[0]\n" "fmla z23.s, z7.s, z3.s[0]\n" - "ble 54f\n" - "ld1w { 
z6.s }, p5/Z, [x15]\n" + "ble 50f\n" + "ld1w { z6.s }, p5/Z, [x10]\n" "fmla z8.s, z6.s, z0.s[1]\n" - "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.s, z6.s, z1.s[1]\n" "fmla z16.s, z6.s, z2.s[1]\n" "fmla z20.s, z6.s, z3.s[1]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.s, z7.s, z0.s[1]\n" "fmla z13.s, z7.s, z1.s[1]\n" "fmla z17.s, z7.s, z2.s[1]\n" "fmla z21.s, z7.s, z3.s[1]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.s, z6.s, z0.s[1]\n" "fmla z14.s, z6.s, z1.s[1]\n" "fmla z18.s, z6.s, z2.s[1]\n" @@ -1097,21 +1067,21 @@ void sve_hybrid_fp32_mla_6x4VL ( "fmla z15.s, z7.s, z1.s[1]\n" "fmla z19.s, z7.s, z2.s[1]\n" "fmla z23.s, z7.s, z3.s[1]\n" - "ble 54f\n" - "ld1w { z6.s }, p5/Z, [x15]\n" + "ble 50f\n" + "ld1w { z6.s }, p5/Z, [x10]\n" "fmla z8.s, z6.s, z0.s[2]\n" - "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.s, z6.s, z1.s[2]\n" "fmla z16.s, z6.s, z2.s[2]\n" "fmla z20.s, z6.s, z3.s[2]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.s, z7.s, z0.s[2]\n" "fmla z13.s, z7.s, z1.s[2]\n" "fmla z17.s, z7.s, z2.s[2]\n" "fmla z21.s, z7.s, z3.s[2]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.s, z6.s, z0.s[2]\n" "fmla z14.s, z6.s, z1.s[2]\n" "fmla z18.s, z6.s, z2.s[2]\n" @@ -1120,20 +1090,20 @@ void sve_hybrid_fp32_mla_6x4VL ( "fmla z15.s, z7.s, z1.s[2]\n" "fmla z19.s, z7.s, z2.s[2]\n" "fmla z23.s, z7.s, z3.s[2]\n" - "ble 54f\n" - "ld1w { z6.s }, p5/Z, [x15]\n" + "ble 50f\n" + "ld1w { z6.s }, p5/Z, [x10]\n" "fmla z8.s, z6.s, z0.s[3]\n" - "ld1w { z7.s }, 
p5/Z, [x15, #1, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" "fmla z12.s, z6.s, z1.s[3]\n" "fmla z16.s, z6.s, z2.s[3]\n" "fmla z20.s, z6.s, z3.s[3]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.s, z7.s, z0.s[3]\n" "fmla z13.s, z7.s, z1.s[3]\n" "fmla z17.s, z7.s, z2.s[3]\n" "fmla z21.s, z7.s, z3.s[3]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.s, z6.s, z0.s[3]\n" "fmla z14.s, z6.s, z1.s[3]\n" "fmla z18.s, z6.s, z2.s[3]\n" @@ -1142,16 +1112,20 @@ void sve_hybrid_fp32_mla_6x4VL ( "fmla z15.s, z7.s, z1.s[3]\n" "fmla z19.s, z7.s, z2.s[3]\n" "fmla z23.s, z7.s, z3.s[3]\n" - "54:" // Height 4: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "50:" // Height 4: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 49b\n" - "tbz %x[flags], #1, 55f\n" + "cmp x27, x19\n" + "bne 45b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "tbz %x[flags], #1, 51f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1rw { z1.s }, p5/Z, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -1188,73 +1162,52 @@ void sve_hybrid_fp32_mla_6x4VL ( "fmax z21.s, p5/M, z21.s, z1.s\n" "fmax z22.s, p5/M, z22.s, z1.s\n" "fmax z23.s, p5/M, z23.s, z1.s\n" - "55:" // Height 4: No activation - "st1w { z8.s }, p4, [x13]\n" - "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" - "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" - "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "st1w { z12.s }, p4, [x9]\n" - 
"st1w { z13.s }, p3, [x9, #1, MUL VL]\n" - "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" - "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" - "addvl x9, x9, #4\n" - "st1w { z16.s }, p4, [x27]\n" - "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" - "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" - "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" - "addvl x27, x27, #4\n" - "st1w { z20.s }, p4, [x25]\n" - "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" - "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" - "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" - "addvl x25, x25, #4\n" - "56:" // Height 4: Writeback done - "decw x16, ALL, MUL #4\n" - "cmp x16, XZR\n" - "bgt 45b\n" - "b 86f\n" - "57:" // Height 5 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 58f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "ldr x23, [%x[output_ptr], #0x20]\n" - "add x27, x27, x19, LSL #2\n" - "add x25, x25, x19, LSL #2\n" - "add x23, x23, x19, LSL #2\n" - "b 59f\n" - "58:" // Height 5: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "add x23, x25, x19, LSL #2\n" - "59:" // Height 5: Column loop + "51:" // Height 4: No activation + "st1w { z8.s }, p4, [x28]\n" + "st1w { z9.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p4, [x24]\n" + "st1w { z13.s }, p3, [x24, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x24, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x24, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x23]\n" + "st1w { z17.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x23, #3, MUL 
VL]\n" + "st1w { z20.s }, p4, [x22]\n" + "st1w { z21.s }, p3, [x22, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x22, #3, MUL VL]\n" + "52:" // Height 4: Writeback done + "decw x11, ALL, MUL #4\n" + "cmp x11, XZR\n" + "bgt 41b\n" + "b 80f\n" + "53:" // Height 5 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "54:" // Height 5: Column loop "mov x19, #0x0\n" - "whilelt p4.s, x19, x16\n" + "whilelt p4.s, x19, x11\n" "incw x19\n" - "whilelt p3.s, x19, x16\n" + "whilelt p3.s, x19, x11\n" "incw x19\n" - "whilelt p2.s, x19, x16\n" + "whilelt p2.s, x19, x11\n" "incw x19\n" - "whilelt p1.s, x19, x16\n" - "cbz x14, 60f\n" - "ld1w { z8.s }, p5/Z, [x14]\n" + "whilelt p1.s, x19, x11\n" + "cbz x9, 55f\n" + "ld1w { z8.s }, p5/Z, [x9]\n" "mov z12.d, z8.d\n" - "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n" "mov z16.d, z8.d\n" - "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n" "mov z20.d, z8.d\n" - "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "mov z13.d, z9.d\n" "mov z17.d, z9.d\n" "mov z14.d, z10.d\n" @@ -1268,31 +1221,36 @@ void sve_hybrid_fp32_mla_6x4VL ( "mov z25.d, z9.d\n" "mov z26.d, z10.d\n" "mov z27.d, z11.d\n" - "b 62f\n" - "60:" // Height 5: no bias - "tbz %x[flags], #0, 61f\n" - "ld1w { z8.s }, p4/Z, [x13]\n" - "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" - "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x9]\n" - "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x27]\n" - "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x27, #3, 
MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x25]\n" - "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" - "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z24.s }, p4/Z, [x23]\n" - "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n" - "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n" - "b 62f\n" - "61:" // Height 5: no accumulate + "b 57f\n" + "55:" // Height 5: no bias + "tbz %x[flags], #0, 56f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z8.s }, p4/Z, [x28]\n" + "add x24, x28, x19, LSL #2\n" + "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n" + "add x23, x24, x19, LSL #2\n" + "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z12.s }, p4/Z, [x24]\n" + "add x21, x22, x19, LSL #2\n" + "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x23]\n" + "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x22]\n" + "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x21]\n" + "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n" + "b 57f\n" + "56:" // Height 5: no accumulate "mov z8.b, #0x0\n" "mov z9.b, #0x0\n" "mov z10.b, #0x0\n" @@ -1313,143 +1271,143 @@ void sve_hybrid_fp32_mla_6x4VL ( "mov z25.b, #0x0\n" "mov z26.b, #0x0\n" "mov z27.b, #0x0\n" - "62:" // Height 5: setup done - "mov x12, #0x0\n" - "63:" // Height 5: String loop + "57:" // Height 5: setup done + "mov x27, #0x0\n" + "58:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], 
%[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 64f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 59f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "ldr x22, [x20, #0x20]\n" - "cbnz x12, 65f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "cbnz x27, 60f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #2\n" - "add x28, x28, x19, LSL #2\n" - "add x26, x26, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" "add x22, x22, x19, LSL #2\n" - "b 65f\n" - "64:" // Height 5: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #2\n" - "add x26, x28, x19, LSL #2\n" - "add x24, x26, x19, LSL #2\n" - "add x22, x24, x19, LSL #2\n" - "65:" // Height 5: input setup done - "cmp x11, #0x4\n" - "ble 67f\n" - "66:" // Height 5: Multiply loop: Main loop head - "ld1w { z6.s }, p5/Z, [x15]\n" - "whilelt p0.s, XZR, x11\n" - "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" - "sub x11, x11, #0x4\n" - "ld1rqw { z0.s }, p0/Z, [x10]\n" + "add x21, x21, x19, LSL #2\n" + "b 60f\n" + "59:" // Height 5: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "60:" // Height 5: input setup done + "cmp x26, #0x4\n" + "ble 62f\n" + "61:" // Height 5: Multiply loop: Main loop head + "ld1w { z6.s }, p5/Z, [x10]\n" + "whilelt p0.s, XZR, x26\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" + "sub x26, x26, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x25]\n" "fmla z8.s, z6.s, z0.s[0]\n" - "ld1rqw { z1.s }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + 
"ld1rqw { z1.s }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "fmla z9.s, z7.s, z0.s[0]\n" - "ld1rqw { z2.s }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqw { z2.s }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "fmla z12.s, z6.s, z1.s[0]\n" - "ld1rqw { z3.s }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqw { z3.s }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "fmla z16.s, z6.s, z2.s[0]\n" - "ld1rqw { z4.s }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" - "fmla z13.s, z7.s, z1.s[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" + "ld1rqw { z4.s }, p0/Z, [x21]\n" "add x22, x22, #0x10\n" + "fmla z13.s, z7.s, z1.s[0]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x21, x21, #0x10\n" "fmla z20.s, z6.s, z3.s[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "cmp x11, #0x4\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x26, #0x4\n" "fmla z24.s, z6.s, z4.s[0]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" "fmla z17.s, z7.s, z2.s[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "fmla z21.s, z7.s, z3.s[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" "fmla z25.s, z7.s, z4.s[0]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" "fmla z10.s, z6.s, z0.s[0]\n" - "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" "fmla z14.s, z6.s, z1.s[0]\n" "fmla z18.s, z6.s, z2.s[0]\n" "fmla z22.s, z6.s, z3.s[0]\n" "fmla z26.s, z6.s, z4.s[0]\n" - "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n" "fmla z11.s, z7.s, z0.s[0]\n" "fmla z15.s, z7.s, z1.s[0]\n" "fmla z19.s, z7.s, z2.s[0]\n" "fmla z23.s, z7.s, z3.s[0]\n" "fmla z27.s, z7.s, z4.s[0]\n" - "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n" "fmla z8.s, z6.s, z0.s[1]\n" "fmla z12.s, z6.s, z1.s[1]\n" "fmla z16.s, z6.s, z2.s[1]\n" "fmla z20.s, z6.s, z3.s[1]\n" "fmla z24.s, z6.s, z4.s[1]\n" - "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n" + 
"ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n" "fmla z9.s, z7.s, z0.s[1]\n" "fmla z13.s, z7.s, z1.s[1]\n" "fmla z17.s, z7.s, z2.s[1]\n" "fmla z21.s, z7.s, z3.s[1]\n" "fmla z25.s, z7.s, z4.s[1]\n" - "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n" - "addvl x15, x15, #16\n" + "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" "fmla z10.s, z6.s, z0.s[1]\n" "fmla z14.s, z6.s, z1.s[1]\n" "fmla z18.s, z6.s, z2.s[1]\n" "fmla z22.s, z6.s, z3.s[1]\n" "fmla z26.s, z6.s, z4.s[1]\n" - "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n" "fmla z11.s, z7.s, z0.s[1]\n" "fmla z15.s, z7.s, z1.s[1]\n" "fmla z19.s, z7.s, z2.s[1]\n" "fmla z23.s, z7.s, z3.s[1]\n" "fmla z27.s, z7.s, z4.s[1]\n" - "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n" "fmla z8.s, z6.s, z0.s[2]\n" "fmla z12.s, z6.s, z1.s[2]\n" "fmla z16.s, z6.s, z2.s[2]\n" "fmla z20.s, z6.s, z3.s[2]\n" "fmla z24.s, z6.s, z4.s[2]\n" - "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n" "fmla z9.s, z7.s, z0.s[2]\n" "fmla z13.s, z7.s, z1.s[2]\n" "fmla z17.s, z7.s, z2.s[2]\n" "fmla z21.s, z7.s, z3.s[2]\n" "fmla z25.s, z7.s, z4.s[2]\n" - "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n" "fmla z10.s, z6.s, z0.s[2]\n" "fmla z14.s, z6.s, z1.s[2]\n" "fmla z18.s, z6.s, z2.s[2]\n" "fmla z22.s, z6.s, z3.s[2]\n" "fmla z26.s, z6.s, z4.s[2]\n" - "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n" "fmla z11.s, z7.s, z0.s[2]\n" "fmla z15.s, z7.s, z1.s[2]\n" "fmla z19.s, z7.s, z2.s[2]\n" "fmla z23.s, z7.s, z3.s[2]\n" "fmla z27.s, z7.s, z4.s[2]\n" - "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n" "fmla z8.s, z6.s, z0.s[3]\n" "fmla z12.s, z6.s, z1.s[3]\n" "fmla z16.s, z6.s, z2.s[3]\n" "fmla z20.s, z6.s, z3.s[3]\n" "fmla z24.s, z6.s, z4.s[3]\n" - "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n" + "ld1w { z6.s }, 
p5/Z, [x10, #-2, MUL VL]\n" "fmla z9.s, z7.s, z0.s[3]\n" "fmla z13.s, z7.s, z1.s[3]\n" "fmla z17.s, z7.s, z2.s[3]\n" "fmla z21.s, z7.s, z3.s[3]\n" "fmla z25.s, z7.s, z4.s[3]\n" - "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n" "fmla z10.s, z6.s, z0.s[3]\n" "fmla z14.s, z6.s, z1.s[3]\n" "fmla z18.s, z6.s, z2.s[3]\n" @@ -1460,35 +1418,35 @@ void sve_hybrid_fp32_mla_6x4VL ( "fmla z19.s, z7.s, z2.s[3]\n" "fmla z23.s, z7.s, z3.s[3]\n" "fmla z27.s, z7.s, z4.s[3]\n" - "bgt 66b\n" - "67:" // Height 5: Multiply loop: Single iteration only - "ld1w { z6.s }, p5/Z, [x15]\n" - "whilelt p0.s, XZR, x11\n" - "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" - "ld1rqw { z0.s }, p0/Z, [x10]\n" + "bgt 61b\n" + "62:" // Height 5: Multiply loop: Single iteration only + "ld1w { z6.s }, p5/Z, [x10]\n" + "whilelt p0.s, XZR, x26\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x25]\n" "fmla z8.s, z6.s, z0.s[0]\n" - "ld1rqw { z1.s }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqw { z1.s }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "fmla z9.s, z7.s, z0.s[0]\n" - "ld1rqw { z2.s }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqw { z2.s }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "fmla z12.s, z6.s, z1.s[0]\n" - "ld1rqw { z3.s }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqw { z3.s }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "fmla z16.s, z6.s, z2.s[0]\n" - "ld1rqw { z4.s }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" - "fmla z13.s, z7.s, z1.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x21]\n" "add x22, x22, #0x10\n" + "fmla z13.s, z7.s, z1.s[0]\n" + "add x21, x21, #0x10\n" "fmla z17.s, z7.s, z2.s[0]\n" "fmla z20.s, z6.s, z3.s[0]\n" "fmla z24.s, z6.s, z4.s[0]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" "fmla z21.s, z7.s, z3.s[0]\n" "fmla z25.s, z7.s, z4.s[0]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1w { 
z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.s, z6.s, z0.s[0]\n" "fmla z14.s, z6.s, z1.s[0]\n" "fmla z18.s, z6.s, z2.s[0]\n" @@ -1499,23 +1457,23 @@ void sve_hybrid_fp32_mla_6x4VL ( "fmla z19.s, z7.s, z2.s[0]\n" "fmla z23.s, z7.s, z3.s[0]\n" "fmla z27.s, z7.s, z4.s[0]\n" - "ble 68f\n" - "ld1w { z6.s }, p5/Z, [x15]\n" + "ble 63f\n" + "ld1w { z6.s }, p5/Z, [x10]\n" "fmla z8.s, z6.s, z0.s[1]\n" - "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.s, z6.s, z1.s[1]\n" "fmla z16.s, z6.s, z2.s[1]\n" "fmla z20.s, z6.s, z3.s[1]\n" "fmla z24.s, z6.s, z4.s[1]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.s, z7.s, z0.s[1]\n" "fmla z13.s, z7.s, z1.s[1]\n" "fmla z17.s, z7.s, z2.s[1]\n" "fmla z21.s, z7.s, z3.s[1]\n" "fmla z25.s, z7.s, z4.s[1]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.s, z6.s, z0.s[1]\n" "fmla z14.s, z6.s, z1.s[1]\n" "fmla z18.s, z6.s, z2.s[1]\n" @@ -1526,23 +1484,23 @@ void sve_hybrid_fp32_mla_6x4VL ( "fmla z19.s, z7.s, z2.s[1]\n" "fmla z23.s, z7.s, z3.s[1]\n" "fmla z27.s, z7.s, z4.s[1]\n" - "ble 68f\n" - "ld1w { z6.s }, p5/Z, [x15]\n" + "ble 63f\n" + "ld1w { z6.s }, p5/Z, [x10]\n" "fmla z8.s, z6.s, z0.s[2]\n" - "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.s, z6.s, z1.s[2]\n" "fmla z16.s, z6.s, z2.s[2]\n" "fmla z20.s, z6.s, z3.s[2]\n" "fmla z24.s, z6.s, z4.s[2]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.s, z7.s, z0.s[2]\n" "fmla z13.s, z7.s, z1.s[2]\n" "fmla z17.s, z7.s, z2.s[2]\n" "fmla z21.s, z7.s, z3.s[2]\n" "fmla z25.s, z7.s, z4.s[2]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1w 
{ z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.s, z6.s, z0.s[2]\n" "fmla z14.s, z6.s, z1.s[2]\n" "fmla z18.s, z6.s, z2.s[2]\n" @@ -1553,22 +1511,22 @@ void sve_hybrid_fp32_mla_6x4VL ( "fmla z19.s, z7.s, z2.s[2]\n" "fmla z23.s, z7.s, z3.s[2]\n" "fmla z27.s, z7.s, z4.s[2]\n" - "ble 68f\n" - "ld1w { z6.s }, p5/Z, [x15]\n" + "ble 63f\n" + "ld1w { z6.s }, p5/Z, [x10]\n" "fmla z8.s, z6.s, z0.s[3]\n" - "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" "fmla z12.s, z6.s, z1.s[3]\n" "fmla z16.s, z6.s, z2.s[3]\n" "fmla z20.s, z6.s, z3.s[3]\n" "fmla z24.s, z6.s, z4.s[3]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.s, z7.s, z0.s[3]\n" "fmla z13.s, z7.s, z1.s[3]\n" "fmla z17.s, z7.s, z2.s[3]\n" "fmla z21.s, z7.s, z3.s[3]\n" "fmla z25.s, z7.s, z4.s[3]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.s, z6.s, z0.s[3]\n" "fmla z14.s, z6.s, z1.s[3]\n" "fmla z18.s, z6.s, z2.s[3]\n" @@ -1579,17 +1537,22 @@ void sve_hybrid_fp32_mla_6x4VL ( "fmla z19.s, z7.s, z2.s[3]\n" "fmla z23.s, z7.s, z3.s[3]\n" "fmla z27.s, z7.s, z4.s[3]\n" - "68:" // Height 5: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "63:" // Height 5: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 63b\n" - "tbz %x[flags], #1, 69f\n" + "cmp x27, x19\n" + "bne 58b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, 
LSL #2\n" + "tbz %x[flags], #1, 64f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1rw { z1.s }, p5/Z, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -1634,83 +1597,59 @@ void sve_hybrid_fp32_mla_6x4VL ( "fmax z25.s, p5/M, z25.s, z1.s\n" "fmax z26.s, p5/M, z26.s, z1.s\n" "fmax z27.s, p5/M, z27.s, z1.s\n" - "69:" // Height 5: No activation - "st1w { z8.s }, p4, [x13]\n" - "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" - "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" - "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "st1w { z12.s }, p4, [x9]\n" - "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" - "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" - "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" - "addvl x9, x9, #4\n" - "st1w { z16.s }, p4, [x27]\n" - "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" - "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" - "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" - "addvl x27, x27, #4\n" - "st1w { z20.s }, p4, [x25]\n" - "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" - "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" - "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" - "addvl x25, x25, #4\n" - "st1w { z24.s }, p4, [x23]\n" - "st1w { z25.s }, p3, [x23, #1, MUL VL]\n" - "st1w { z26.s }, p2, [x23, #2, MUL VL]\n" - "st1w { z27.s }, p1, [x23, #3, MUL VL]\n" - "addvl x23, x23, #4\n" - "70:" // Height 5: Writeback done - "decw x16, ALL, MUL #4\n" - "cmp x16, XZR\n" - "bgt 59b\n" - "b 86f\n" - "71:" // Height 6 - "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 72f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "ldr x23, [%x[output_ptr], #0x20]\n" - "add x27, x27, x19, LSL #2\n" - "ldr x21, [%x[output_ptr], #0x28]\n" - "add %x[output_ptr], %x[output_ptr], #0x30\n" - "add 
x25, x25, x19, LSL #2\n" - "add x23, x23, x19, LSL #2\n" - "add x21, x21, x19, LSL #2\n" - "b 73f\n" - "72:" // Height 6: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "add x23, x25, x19, LSL #2\n" - "add x21, x23, x19, LSL #2\n" - "add %x[output_ptr], x21, x19, LSL #2\n" - "73:" // Height 6: Column loop + "64:" // Height 5: No activation + "st1w { z8.s }, p4, [x28]\n" + "st1w { z9.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p4, [x24]\n" + "st1w { z13.s }, p3, [x24, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x24, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x24, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x23]\n" + "st1w { z17.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x23, #3, MUL VL]\n" + "st1w { z20.s }, p4, [x22]\n" + "st1w { z21.s }, p3, [x22, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x22, #3, MUL VL]\n" + "st1w { z24.s }, p4, [x21]\n" + "st1w { z25.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x21, #3, MUL VL]\n" + "65:" // Height 5: Writeback done + "decw x11, ALL, MUL #4\n" + "cmp x11, XZR\n" + "bgt 54b\n" + "b 80f\n" + "66:" // Height 6 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x19, #0x18\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "67:" // Height 6: Column loop "mov x19, #0x0\n" - "whilelt p4.s, x19, x16\n" + "whilelt p4.s, x19, x11\n" "incw x19\n" - "whilelt p3.s, x19, x16\n" + "whilelt p3.s, x19, x11\n" "incw x19\n" - "whilelt p2.s, x19, x16\n" + "whilelt p2.s, x19, x11\n" "incw x19\n" - "whilelt p1.s, x19, x16\n" - "cbz x14, 
74f\n" - "ld1w { z8.s }, p5/Z, [x14]\n" + "whilelt p1.s, x19, x11\n" + "cbz x9, 68f\n" + "ld1w { z8.s }, p5/Z, [x9]\n" "mov z12.d, z8.d\n" - "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n" "mov z16.d, z8.d\n" - "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n" "mov z20.d, z8.d\n" - "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "mov z13.d, z9.d\n" "mov z17.d, z9.d\n" "mov z14.d, z10.d\n" @@ -1728,35 +1667,41 @@ void sve_hybrid_fp32_mla_6x4VL ( "mov z29.d, z9.d\n" "mov z30.d, z10.d\n" "mov z31.d, z11.d\n" - "b 76f\n" - "74:" // Height 6: no bias - "tbz %x[flags], #0, 75f\n" - "ld1w { z8.s }, p4/Z, [x13]\n" - "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" - "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x9]\n" - "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x27]\n" - "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x25]\n" - "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" - "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z24.s }, p4/Z, [x23]\n" - "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n" - "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n" - "ld1w { z28.s }, p4/Z, [x21]\n" - "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n" - "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n" - "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n" - "b 76f\n" - "75:" // Height 6: no accumulate + "b 70f\n" + "68:" // Height 6: no bias + "tbz %x[flags], #0, 69f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z8.s }, p4/Z, [x28]\n" + "add x24, x28, 
x19, LSL #2\n" + "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n" + "add x23, x24, x19, LSL #2\n" + "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z12.s }, p4/Z, [x24]\n" + "add x21, x22, x19, LSL #2\n" + "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" + "add x20, x21, x19, LSL #2\n" + "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x23]\n" + "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x22]\n" + "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x21]\n" + "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z28.s }, p4/Z, [x20]\n" + "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n" + "b 70f\n" + "69:" // Height 6: no accumulate "mov z8.b, #0x0\n" "mov z9.b, #0x0\n" "mov z10.b, #0x0\n" @@ -1781,77 +1726,77 @@ void sve_hybrid_fp32_mla_6x4VL ( "mov z29.b, #0x0\n" "mov z30.b, #0x0\n" "mov z31.b, #0x0\n" - "76:" // Height 6: setup done - "mov x12, #0x0\n" - "77:" // Height 6: String loop + "70:" // Height 6: setup done + "mov x27, #0x0\n" + "71:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 78f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 72f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, 
#0x18]\n" - "ldr x22, [x20, #0x20]\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" "ldr x20, [x20, #0x28]\n" - "cbnz x12, 79f\n" + "cbnz x27, 73f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19, LSL #2\n" - "add x28, x28, x19, LSL #2\n" - "add x26, x26, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" "add x22, x22, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" "add x20, x20, x19, LSL #2\n" - "b 79f\n" - "78:" // Height 6: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19, LSL #2\n" - "add x26, x28, x19, LSL #2\n" - "add x24, x26, x19, LSL #2\n" - "add x22, x24, x19, LSL #2\n" - "add x20, x22, x19, LSL #2\n" - "79:" // Height 6: input setup done - "cmp x11, #0x4\n" - "ble 81f\n" - "80:" // Height 6: Multiply loop: Main loop head - "ld1w { z6.s }, p5/Z, [x15]\n" - "whilelt p0.s, XZR, x11\n" - "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" - "sub x11, x11, #0x4\n" - "ld1rqw { z0.s }, p0/Z, [x10]\n" + "b 73f\n" + "72:" // Height 6: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "73:" // Height 6: input setup done + "cmp x26, #0x4\n" + "ble 75f\n" + "74:" // Height 6: Multiply loop: Main loop head + "ld1w { z6.s }, p5/Z, [x10]\n" + "whilelt p0.s, XZR, x26\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" + "sub x26, x26, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x25]\n" "fmla z8.s, z6.s, z0.s[0]\n" - "ld1rqw { z1.s }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqw { z1.s }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "fmla z9.s, z7.s, z0.s[0]\n" - "ld1rqw { z2.s }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqw { z2.s }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "fmla z12.s, z6.s, z1.s[0]\n" - "ld1rqw { z3.s }, 
p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqw { z3.s }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "fmla z16.s, z6.s, z2.s[0]\n" - "ld1rqw { z4.s }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" + "ld1rqw { z4.s }, p0/Z, [x21]\n" + "add x22, x22, #0x10\n" "fmla z13.s, z7.s, z1.s[0]\n" "ld1rqw { z5.s }, p0/Z, [x20]\n" - "add x22, x22, #0x10\n" + "add x21, x21, #0x10\n" "fmla z20.s, z6.s, z3.s[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" "add x20, x20, #0x10\n" "fmla z24.s, z6.s, z4.s[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "cmp x11, #0x4\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x26, #0x4\n" "fmla z28.s, z6.s, z5.s[0]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" "fmla z17.s, z7.s, z2.s[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "fmla z21.s, z7.s, z3.s[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "fmla z25.s, z7.s, z4.s[0]\n" "prfm pldl1keep, [x22, #0x80]\n" + "fmla z25.s, z7.s, z4.s[0]\n" + "prfm pldl1keep, [x21, #0x80]\n" "fmla z29.s, z7.s, z5.s[0]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" "fmla z10.s, z6.s, z0.s[0]\n" "prfm pldl1keep, [x20, #0x80]\n" "fmla z14.s, z6.s, z1.s[0]\n" @@ -1859,85 +1804,85 @@ void sve_hybrid_fp32_mla_6x4VL ( "fmla z22.s, z6.s, z3.s[0]\n" "fmla z26.s, z6.s, z4.s[0]\n" "fmla z30.s, z6.s, z5.s[0]\n" - "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n" "fmla z11.s, z7.s, z0.s[0]\n" "fmla z15.s, z7.s, z1.s[0]\n" "fmla z19.s, z7.s, z2.s[0]\n" "fmla z23.s, z7.s, z3.s[0]\n" "fmla z27.s, z7.s, z4.s[0]\n" "fmla z31.s, z7.s, z5.s[0]\n" - "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n" "fmla z8.s, z6.s, z0.s[1]\n" "fmla z12.s, z6.s, z1.s[1]\n" "fmla z16.s, z6.s, z2.s[1]\n" "fmla z20.s, z6.s, z3.s[1]\n" "fmla z24.s, z6.s, z4.s[1]\n" "fmla z28.s, z6.s, z5.s[1]\n" - "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n" + 
"ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n" "fmla z9.s, z7.s, z0.s[1]\n" "fmla z13.s, z7.s, z1.s[1]\n" "fmla z17.s, z7.s, z2.s[1]\n" "fmla z21.s, z7.s, z3.s[1]\n" "fmla z25.s, z7.s, z4.s[1]\n" "fmla z29.s, z7.s, z5.s[1]\n" - "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n" - "addvl x15, x15, #16\n" + "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n" + "addvl x10, x10, #16\n" "fmla z10.s, z6.s, z0.s[1]\n" "fmla z14.s, z6.s, z1.s[1]\n" "fmla z18.s, z6.s, z2.s[1]\n" "fmla z22.s, z6.s, z3.s[1]\n" "fmla z26.s, z6.s, z4.s[1]\n" "fmla z30.s, z6.s, z5.s[1]\n" - "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n" "fmla z11.s, z7.s, z0.s[1]\n" "fmla z15.s, z7.s, z1.s[1]\n" "fmla z19.s, z7.s, z2.s[1]\n" "fmla z23.s, z7.s, z3.s[1]\n" "fmla z27.s, z7.s, z4.s[1]\n" "fmla z31.s, z7.s, z5.s[1]\n" - "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n" "fmla z8.s, z6.s, z0.s[2]\n" "fmla z12.s, z6.s, z1.s[2]\n" "fmla z16.s, z6.s, z2.s[2]\n" "fmla z20.s, z6.s, z3.s[2]\n" "fmla z24.s, z6.s, z4.s[2]\n" "fmla z28.s, z6.s, z5.s[2]\n" - "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n" "fmla z9.s, z7.s, z0.s[2]\n" "fmla z13.s, z7.s, z1.s[2]\n" "fmla z17.s, z7.s, z2.s[2]\n" "fmla z21.s, z7.s, z3.s[2]\n" "fmla z25.s, z7.s, z4.s[2]\n" "fmla z29.s, z7.s, z5.s[2]\n" - "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n" "fmla z10.s, z6.s, z0.s[2]\n" "fmla z14.s, z6.s, z1.s[2]\n" "fmla z18.s, z6.s, z2.s[2]\n" "fmla z22.s, z6.s, z3.s[2]\n" "fmla z26.s, z6.s, z4.s[2]\n" "fmla z30.s, z6.s, z5.s[2]\n" - "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n" "fmla z11.s, z7.s, z0.s[2]\n" "fmla z15.s, z7.s, z1.s[2]\n" "fmla z19.s, z7.s, z2.s[2]\n" "fmla z23.s, z7.s, z3.s[2]\n" "fmla z27.s, z7.s, z4.s[2]\n" "fmla z31.s, z7.s, z5.s[2]\n" - "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n" "fmla 
z8.s, z6.s, z0.s[3]\n" "fmla z12.s, z6.s, z1.s[3]\n" "fmla z16.s, z6.s, z2.s[3]\n" "fmla z20.s, z6.s, z3.s[3]\n" "fmla z24.s, z6.s, z4.s[3]\n" "fmla z28.s, z6.s, z5.s[3]\n" - "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n" "fmla z9.s, z7.s, z0.s[3]\n" "fmla z13.s, z7.s, z1.s[3]\n" "fmla z17.s, z7.s, z2.s[3]\n" "fmla z21.s, z7.s, z3.s[3]\n" "fmla z25.s, z7.s, z4.s[3]\n" "fmla z29.s, z7.s, z5.s[3]\n" - "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n" "fmla z10.s, z6.s, z0.s[3]\n" "fmla z14.s, z6.s, z1.s[3]\n" "fmla z18.s, z6.s, z2.s[3]\n" @@ -1950,39 +1895,39 @@ void sve_hybrid_fp32_mla_6x4VL ( "fmla z23.s, z7.s, z3.s[3]\n" "fmla z27.s, z7.s, z4.s[3]\n" "fmla z31.s, z7.s, z5.s[3]\n" - "bgt 80b\n" - "81:" // Height 6: Multiply loop: Single iteration only - "ld1w { z6.s }, p5/Z, [x15]\n" - "whilelt p0.s, XZR, x11\n" - "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" - "ld1rqw { z0.s }, p0/Z, [x10]\n" + "bgt 74b\n" + "75:" // Height 6: Multiply loop: Single iteration only + "ld1w { z6.s }, p5/Z, [x10]\n" + "whilelt p0.s, XZR, x26\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x25]\n" "fmla z8.s, z6.s, z0.s[0]\n" - "ld1rqw { z1.s }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqw { z1.s }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "fmla z9.s, z7.s, z0.s[0]\n" - "ld1rqw { z2.s }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqw { z2.s }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "fmla z12.s, z6.s, z1.s[0]\n" - "ld1rqw { z3.s }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqw { z3.s }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "fmla z16.s, z6.s, z2.s[0]\n" - "ld1rqw { z4.s }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" + "ld1rqw { z4.s }, p0/Z, [x21]\n" + "add x22, x22, #0x10\n" "fmla z13.s, z7.s, z1.s[0]\n" "ld1rqw { z5.s }, p0/Z, [x20]\n" - "add x22, x22, #0x10\n" + "add x21, x21, #0x10\n" "fmla z20.s, z6.s, 
z3.s[0]\n" "add x20, x20, #0x10\n" "fmla z17.s, z7.s, z2.s[0]\n" "fmla z24.s, z6.s, z4.s[0]\n" "fmla z28.s, z6.s, z5.s[0]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" "fmla z21.s, z7.s, z3.s[0]\n" "fmla z25.s, z7.s, z4.s[0]\n" "fmla z29.s, z7.s, z5.s[0]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.s, z6.s, z0.s[0]\n" "fmla z14.s, z6.s, z1.s[0]\n" "fmla z18.s, z6.s, z2.s[0]\n" @@ -1995,25 +1940,25 @@ void sve_hybrid_fp32_mla_6x4VL ( "fmla z23.s, z7.s, z3.s[0]\n" "fmla z27.s, z7.s, z4.s[0]\n" "fmla z31.s, z7.s, z5.s[0]\n" - "ble 82f\n" - "ld1w { z6.s }, p5/Z, [x15]\n" + "ble 76f\n" + "ld1w { z6.s }, p5/Z, [x10]\n" "fmla z8.s, z6.s, z0.s[1]\n" - "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, #0x1\n" "fmla z12.s, z6.s, z1.s[1]\n" "fmla z16.s, z6.s, z2.s[1]\n" "fmla z20.s, z6.s, z3.s[1]\n" "fmla z24.s, z6.s, z4.s[1]\n" "fmla z28.s, z6.s, z5.s[1]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.s, z7.s, z0.s[1]\n" "fmla z13.s, z7.s, z1.s[1]\n" "fmla z17.s, z7.s, z2.s[1]\n" "fmla z21.s, z7.s, z3.s[1]\n" "fmla z25.s, z7.s, z4.s[1]\n" "fmla z29.s, z7.s, z5.s[1]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.s, z6.s, z0.s[1]\n" "fmla z14.s, z6.s, z1.s[1]\n" "fmla z18.s, z6.s, z2.s[1]\n" @@ -2026,25 +1971,25 @@ void sve_hybrid_fp32_mla_6x4VL ( "fmla z23.s, z7.s, z3.s[1]\n" "fmla z27.s, z7.s, z4.s[1]\n" "fmla z31.s, z7.s, z5.s[1]\n" - "ble 82f\n" - "ld1w { z6.s }, p5/Z, [x15]\n" + "ble 76f\n" + "ld1w { z6.s }, p5/Z, [x10]\n" "fmla z8.s, z6.s, z0.s[2]\n" - "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" - "subs x11, x11, #0x1\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" + "subs x26, x26, 
#0x1\n" "fmla z12.s, z6.s, z1.s[2]\n" "fmla z16.s, z6.s, z2.s[2]\n" "fmla z20.s, z6.s, z3.s[2]\n" "fmla z24.s, z6.s, z4.s[2]\n" "fmla z28.s, z6.s, z5.s[2]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.s, z7.s, z0.s[2]\n" "fmla z13.s, z7.s, z1.s[2]\n" "fmla z17.s, z7.s, z2.s[2]\n" "fmla z21.s, z7.s, z3.s[2]\n" "fmla z25.s, z7.s, z4.s[2]\n" "fmla z29.s, z7.s, z5.s[2]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.s, z6.s, z0.s[2]\n" "fmla z14.s, z6.s, z1.s[2]\n" "fmla z18.s, z6.s, z2.s[2]\n" @@ -2057,24 +2002,24 @@ void sve_hybrid_fp32_mla_6x4VL ( "fmla z23.s, z7.s, z3.s[2]\n" "fmla z27.s, z7.s, z4.s[2]\n" "fmla z31.s, z7.s, z5.s[2]\n" - "ble 82f\n" - "ld1w { z6.s }, p5/Z, [x15]\n" + "ble 76f\n" + "ld1w { z6.s }, p5/Z, [x10]\n" "fmla z8.s, z6.s, z0.s[3]\n" - "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" "fmla z12.s, z6.s, z1.s[3]\n" "fmla z16.s, z6.s, z2.s[3]\n" "fmla z20.s, z6.s, z3.s[3]\n" "fmla z24.s, z6.s, z4.s[3]\n" "fmla z28.s, z6.s, z5.s[3]\n" - "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" "fmla z9.s, z7.s, z0.s[3]\n" "fmla z13.s, z7.s, z1.s[3]\n" "fmla z17.s, z7.s, z2.s[3]\n" "fmla z21.s, z7.s, z3.s[3]\n" "fmla z25.s, z7.s, z4.s[3]\n" "fmla z29.s, z7.s, z5.s[3]\n" - "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" - "addvl x15, x15, #4\n" + "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" "fmla z10.s, z6.s, z0.s[3]\n" "fmla z14.s, z6.s, z1.s[3]\n" "fmla z18.s, z6.s, z2.s[3]\n" @@ -2087,18 +2032,24 @@ void sve_hybrid_fp32_mla_6x4VL ( "fmla z23.s, z7.s, z3.s[3]\n" "fmla z27.s, z7.s, z4.s[3]\n" "fmla z31.s, z7.s, z5.s[3]\n" - "82:" // Height 6: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" - "prfm pldl1keep, [x26, #0x80]\n" + 
"76:" // Height 6: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" "prfm pldl1keep, [x20, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 77b\n" - "tbz %x[flags], #1, 83f\n" + "cmp x27, x19\n" + "bne 71b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x24, x28, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "tbz %x[flags], #1, 77f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1rw { z1.s }, p5/Z, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -2151,57 +2102,52 @@ void sve_hybrid_fp32_mla_6x4VL ( "fmax z29.s, p5/M, z29.s, z1.s\n" "fmax z30.s, p5/M, z30.s, z1.s\n" "fmax z31.s, p5/M, z31.s, z1.s\n" - "83:" // Height 6: No activation - "st1w { z8.s }, p4, [x13]\n" - "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" - "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" - "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "st1w { z12.s }, p4, [x9]\n" - "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" - "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" - "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" - "addvl x9, x9, #4\n" - "st1w { z16.s }, p4, [x27]\n" - "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" - "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" - "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" - "addvl x27, x27, #4\n" - "st1w { z20.s }, p4, [x25]\n" - "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" - "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" - "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" - "addvl x25, x25, #4\n" - "st1w { z24.s }, p4, [x23]\n" - "st1w { z25.s }, p3, [x23, #1, MUL VL]\n" - "st1w { z26.s }, p2, [x23, #2, MUL VL]\n" - "st1w { z27.s }, p1, [x23, #3, MUL VL]\n" - "addvl x23, x23, #4\n" - "st1w { z28.s }, p4, [x21]\n" - "st1w { z29.s }, p3, [x21, #1, MUL VL]\n" - "st1w 
{ z30.s }, p2, [x21, #2, MUL VL]\n" - "st1w { z31.s }, p1, [x21, #3, MUL VL]\n" - "addvl x21, x21, #4\n" - "84:" // Height 6: Writeback done - "decw x16, ALL, MUL #4\n" - "cmp x16, XZR\n" - "bgt 73b\n" + "77:" // Height 6: No activation + "st1w { z8.s }, p4, [x28]\n" + "st1w { z9.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p4, [x24]\n" + "st1w { z13.s }, p3, [x24, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x24, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x24, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x23]\n" + "st1w { z17.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x23, #3, MUL VL]\n" + "st1w { z20.s }, p4, [x22]\n" + "st1w { z21.s }, p3, [x22, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x22, #3, MUL VL]\n" + "st1w { z24.s }, p4, [x21]\n" + "st1w { z25.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x21, #3, MUL VL]\n" + "st1w { z28.s }, p4, [x20]\n" + "st1w { z29.s }, p3, [x20, #1, MUL VL]\n" + "st1w { z30.s }, p2, [x20, #2, MUL VL]\n" + "st1w { z31.s }, p1, [x20, #3, MUL VL]\n" + "78:" // Height 6: Writeback done + "decw x11, ALL, MUL #4\n" + "cmp x11, XZR\n" + "bgt 67b\n" "subs %x[M], %x[M], #0x6\n" - "beq 86f\n" + "beq 80f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 85f\n" + "tbz %x[flags], #3, 79f\n" "add x20, x20, #0x6\n" "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "85:" // Update direct input + "79:" // Update direct input "mov x19, #0x18\n" "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" "b 1b\n" - "86:" // Exit + "80:" // Exit : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, 
minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) - : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp index 5238a9ba12..2273d97d5f 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp @@ -36,7 +36,6 @@ namespace arm_gemm { - // Actual kernel implementations void sve_hybrid_fp32_mla_8x1VL( ARGLIST ); @@ -73,7 +72,6 @@ public: // Default to the generic kernel kern_type kernel=sve_hybrid_fp32_mla_8x1VL; - cls_sve_hybrid_fp32_mla_8x1VL(const CPUInfo *) { } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp index 
0e45b06765..863325f7f5 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp @@ -27,6 +27,7 @@ #include "../../utils.hpp" #include +#include namespace arm_gemm { @@ -94,230 +95,219 @@ void sve_hybrid_fp32_mla_8x1VL ( "ptrue p2.b\n" "1:" // Row loop "cmp %x[M], #0x8\n" - "bge 99f\n" + "bge 92f\n" "cmp %x[M], #0x6\n" - "bgt 85f\n" - "beq 71f\n" + "bgt 79f\n" + "beq 66f\n" "cmp %x[M], #0x4\n" - "bgt 57f\n" - "beq 43f\n" + "bgt 53f\n" + "beq 40f\n" "cmp %x[M], #0x2\n" - "bgt 29f\n" - "beq 15f\n" - "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "mov x8, %x[bias]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 2f\n" - "ldr x17, [%x[output_ptr], #0x0]\n" - "add x17, x17, x19, LSL #2\n" - "b 3f\n" - "2:" // Height 1: setup direct output - "mov x17, %x[output_ptr]\n" - "3:" // Height 1: Column loop + "bgt 27f\n" + "beq 14f\n" + "ldr x13, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x11, %x[bias]\n" + "mov x10, %x[output_ptr]\n" + "2:" // Height 1: Column loop "mov x19, #0x0\n" - "whilelt p1.s, x19, x6\n" - "cbz x8, 4f\n" - "ld1w { z24.s }, p2/Z, [x8]\n" - "addvl x8, x8, #1\n" - "b 6f\n" - "4:" // Height 1: no bias - "tbz %x[flags], #0, 5f\n" - "ld1w { z24.s }, p1/Z, [x17]\n" - "b 6f\n" - "5:" // Height 1: no accumulate + "whilelt p1.s, x19, x13\n" + "cbz x11, 3f\n" + "ld1w { z24.s }, p2/Z, [x11]\n" + "addvl x11, x11, #1\n" + "b 5f\n" + "3:" // Height 1: no bias + "tbz %x[flags], #0, 4f\n" + "ld1w { z24.s }, p1/Z, [x10]\n" + "b 5f\n" + "4:" // Height 1: no accumulate "mov z24.b, #0x0\n" - "6:" // Height 1: setup done - "mov x16, #0x0\n" - "7:" // Height 1: String loop + "5:" // Height 1: setup done + "mov x9, #0x0\n" + "6:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], 
%[offsetof_input_offset]]\n" - "ldr w15, [x20, x16, LSL #0x2]\n" - "tbz %x[flags], #3, 8f\n" - "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "ldr w28, [x20, x9, LSL #0x2]\n" + "tbz %x[flags], #3, 7f\n" + "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x14, [x20, #0x0]\n" - "cbnz x16, 9f\n" + "ldr x27, [x20, #0x0]\n" + "cbnz x9, 8f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x14, x14, x19, LSL #2\n" - "b 9f\n" - "8:" // Height 1: setup direct input - "mov x14, %x[input_ptr]\n" - "9:" // Height 1: input setup done - "cmp x15, #0x4\n" - "ble 11f\n" - "10:" // Height 1: Multiply loop: Main loop head - "ld1w { z8.s }, p2/Z, [x7]\n" - "whilelt p0.s, XZR, x15\n" - "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n" - "sub x15, x15, #0x4\n" - "ld1rqw { z0.s }, p0/Z, [x14]\n" + "add x27, x27, x19, LSL #2\n" + "b 8f\n" + "7:" // Height 1: setup direct input + "mov x27, %x[input_ptr]\n" + "8:" // Height 1: input setup done + "cmp x28, #0x4\n" + "ble 10f\n" + "9:" // Height 1: Multiply loop: Main loop head + "ld1w { z8.s }, p2/Z, [x12]\n" + "whilelt p0.s, XZR, x28\n" + "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n" + "sub x28, x28, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" - "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n" - "add x14, x14, #0x10\n" + "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" + "add x27, x27, #0x10\n" "fmla z24.s, z9.s, z0.s[1]\n" - "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n" - "cmp x15, #0x4\n" + "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" + "cmp x28, #0x4\n" "fmla z24.s, z10.s, z0.s[2]\n" - "prfm pldl1keep, [x14, #0x80]\n" - "addvl x7, x7, #4\n" + "prfm pldl1keep, [x27, #0x80]\n" + "addvl x12, x12, #4\n" "fmla z24.s, z11.s, z0.s[3]\n" - "bgt 10b\n" - "11:" // Height 1: Multiply loop: Single iteration only - "ld1w { z12.s }, p2/Z, [x7]\n" - "whilelt p0.s, XZR, x15\n" - "subs x15, x15, #0x1\n" - "ld1rqw { z0.s }, p0/Z, [x14]\n" - "fmla z24.s, z12.s, z0.s[0]\n" - "add x14, x14, #0x10\n" - 
"addvl x7, x7, #1\n" - "ble 12f\n" - "ld1w { z13.s }, p2/Z, [x7]\n" - "fmla z24.s, z13.s, z0.s[1]\n" - "subs x15, x15, #0x1\n" - "addvl x7, x7, #1\n" - "ble 12f\n" - "ld1w { z14.s }, p2/Z, [x7]\n" - "fmla z24.s, z14.s, z0.s[2]\n" - "subs x15, x15, #0x1\n" - "addvl x7, x7, #1\n" - "ble 12f\n" - "ld1w { z15.s }, p2/Z, [x7]\n" - "fmla z24.s, z15.s, z0.s[3]\n" - "addvl x7, x7, #1\n" - "12:" // Height 1: Multiply loop: multiply skip - "prfm pldl1keep, [x14, #0x80]\n" - "add x16, x16, #0x1\n" + "bgt 9b\n" + "10:" // Height 1: Multiply loop: Single iteration only + "ld1w { z8.s }, p2/Z, [x12]\n" + "whilelt p0.s, XZR, x28\n" + "subs x28, x28, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x27]\n" + "fmla z24.s, z8.s, z0.s[0]\n" + "add x27, x27, #0x10\n" + "addvl x12, x12, #1\n" + "ble 11f\n" + "ld1w { z9.s }, p2/Z, [x12]\n" + "fmla z24.s, z9.s, z0.s[1]\n" + "subs x28, x28, #0x1\n" + "addvl x12, x12, #1\n" + "ble 11f\n" + "ld1w { z10.s }, p2/Z, [x12]\n" + "fmla z24.s, z10.s, z0.s[2]\n" + "subs x28, x28, #0x1\n" + "addvl x12, x12, #1\n" + "ble 11f\n" + "ld1w { z11.s }, p2/Z, [x12]\n" + "fmla z24.s, z11.s, z0.s[3]\n" + "addvl x12, x12, #1\n" + "11:" // Height 1: Multiply loop: multiply skip + "prfm pldl1keep, [x27, #0x80]\n" + "add x9, x9, #0x1\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x16, x19\n" - "bne 7b\n" - "tbz %x[flags], #1, 13f\n" + "cmp x9, x19\n" + "bne 6b\n" + "tbz %x[flags], #1, 12f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1rw { z17.s }, p2/Z, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" "ld1rw { z16.s }, p2/Z, [x19]\n" "fmin z24.s, p2/M, z24.s, z16.s\n" "fmax z24.s, p2/M, z24.s, z17.s\n" - "13:" // Height 1: No activation - "st1w { z24.s }, p1, [x17]\n" - "addvl x17, x17, #1\n" - "14:" // Height 1: Writeback done - "decw x6\n" - "cmp x6, XZR\n" - "bgt 3b\n" - "b 114f\n" - "15:" // Height 2 - "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" - "mov x8, %x[bias]\n" - "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], 
%[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 16f\n" - "ldr x17, [%x[output_ptr], #0x0]\n" - "add x17, x17, x19, LSL #2\n" - "ldr x13, [%x[output_ptr], #0x8]\n" - "add x13, x13, x19, LSL #2\n" - "b 17f\n" - "16:" // Height 2: setup direct output - "mov x17, %x[output_ptr]\n" - "add x13, x17, x19, LSL #2\n" - "17:" // Height 2: Column loop + "12:" // Height 1: No activation + "st1w { z24.s }, p1, [x10]\n" + "addvl x10, x10, #1\n" + "13:" // Height 1: Writeback done + "decw x13\n" + "cmp x13, XZR\n" + "bgt 2b\n" + "b 106f\n" + "14:" // Height 2 + "ldr x13, [%x[args_ptr], %[offsetof_N]]\n" + "mov x11, %x[bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x10, %x[output_ptr]\n" + "15:" // Height 2: Column loop "mov x19, #0x0\n" - "whilelt p1.s, x19, x6\n" - "cbz x8, 18f\n" - "ld1w { z24.s }, p2/Z, [x8]\n" + "whilelt p1.s, x19, x13\n" + "cbz x11, 16f\n" + "ld1w { z24.s }, p2/Z, [x11]\n" "mov z25.d, z24.d\n" - "addvl x8, x8, #1\n" - "b 20f\n" - "18:" // Height 2: no bias - "tbz %x[flags], #0, 19f\n" - "ld1w { z24.s }, p1/Z, [x17]\n" - "ld1w { z25.s }, p1/Z, [x13]\n" - "b 20f\n" - "19:" // Height 2: no accumulate + "addvl x11, x11, #1\n" + "b 18f\n" + "16:" // Height 2: no bias + "tbz %x[flags], #0, 17f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z24.s }, p1/Z, [x10]\n" + "add x26, x10, x19, LSL #2\n" + "ld1w { z25.s }, p1/Z, [x26]\n" + "b 18f\n" + "17:" // Height 2: no accumulate "mov z24.b, #0x0\n" "mov z25.b, #0x0\n" - "20:" // Height 2: setup done - "mov x16, #0x0\n" - "21:" // Height 2: String loop + "18:" // Height 2: setup done + "mov x9, #0x0\n" + "19:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w15, [x20, x16, LSL #0x2]\n" - "tbz %x[flags], #3, 22f\n" - "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "ldr w28, [x20, x9, LSL #0x2]\n" + "tbz %x[flags], #3, 20f\n" + "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n" "add x20, x20, 
x19, LSL #3\n" - "ldr x14, [x20, #0x0]\n" - "ldr x12, [x20, #0x8]\n" - "cbnz x16, 23f\n" + "ldr x27, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "cbnz x9, 21f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x14, x14, x19, LSL #2\n" - "add x12, x12, x19, LSL #2\n" - "b 23f\n" - "22:" // Height 2: setup direct input - "mov x14, %x[input_ptr]\n" - "add x12, x14, x19, LSL #2\n" - "23:" // Height 2: input setup done - "cmp x15, #0x4\n" - "ble 25f\n" - "24:" // Height 2: Multiply loop: Main loop head - "ld1w { z8.s }, p2/Z, [x7]\n" - "whilelt p0.s, XZR, x15\n" - "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n" - "sub x15, x15, #0x4\n" - "ld1rqw { z0.s }, p0/Z, [x14]\n" + "add x27, x27, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "b 21f\n" + "20:" // Height 2: setup direct input + "mov x27, %x[input_ptr]\n" + "add x26, x27, x19, LSL #2\n" + "21:" // Height 2: input setup done + "cmp x28, #0x4\n" + "ble 23f\n" + "22:" // Height 2: Multiply loop: Main loop head + "ld1w { z8.s }, p2/Z, [x12]\n" + "whilelt p0.s, XZR, x28\n" + "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n" + "sub x28, x28, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" - "ld1rqw { z1.s }, p0/Z, [x12]\n" - "add x14, x14, #0x10\n" + "ld1rqw { z1.s }, p0/Z, [x26]\n" + "add x27, x27, #0x10\n" "fmla z25.s, z8.s, z1.s[0]\n" - "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n" - "add x12, x12, #0x10\n" + "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" + "add x26, x26, #0x10\n" "fmla z24.s, z9.s, z0.s[1]\n" - "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n" - "cmp x15, #0x4\n" + "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" + "cmp x28, #0x4\n" "fmla z25.s, z9.s, z1.s[1]\n" - "prfm pldl1keep, [x14, #0x80]\n" - "addvl x7, x7, #4\n" + "prfm pldl1keep, [x27, #0x80]\n" + "addvl x12, x12, #4\n" "fmla z24.s, z10.s, z0.s[2]\n" - "prfm pldl1keep, [x12, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" "fmla z25.s, z10.s, z1.s[2]\n" "fmla z24.s, z11.s, z0.s[3]\n" "fmla z25.s, z11.s, z1.s[3]\n" - "bgt 24b\n" - 
"25:" // Height 2: Multiply loop: Single iteration only - "ld1w { z12.s }, p2/Z, [x7]\n" - "whilelt p0.s, XZR, x15\n" - "subs x15, x15, #0x1\n" - "ld1rqw { z0.s }, p0/Z, [x14]\n" - "fmla z24.s, z12.s, z0.s[0]\n" - "ld1rqw { z1.s }, p0/Z, [x12]\n" - "add x14, x14, #0x10\n" - "fmla z25.s, z12.s, z1.s[0]\n" - "add x12, x12, #0x10\n" - "addvl x7, x7, #1\n" - "ble 26f\n" - "ld1w { z13.s }, p2/Z, [x7]\n" - "fmla z24.s, z13.s, z0.s[1]\n" - "subs x15, x15, #0x1\n" - "fmla z25.s, z13.s, z1.s[1]\n" - "addvl x7, x7, #1\n" - "ble 26f\n" - "ld1w { z14.s }, p2/Z, [x7]\n" - "fmla z24.s, z14.s, z0.s[2]\n" - "subs x15, x15, #0x1\n" - "fmla z25.s, z14.s, z1.s[2]\n" - "addvl x7, x7, #1\n" - "ble 26f\n" - "ld1w { z15.s }, p2/Z, [x7]\n" - "fmla z24.s, z15.s, z0.s[3]\n" - "addvl x7, x7, #1\n" - "fmla z25.s, z15.s, z1.s[3]\n" - "26:" // Height 2: Multiply loop: multiply skip - "prfm pldl1keep, [x14, #0x80]\n" - "add x16, x16, #0x1\n" - "prfm pldl1keep, [x12, #0x80]\n" + "bgt 22b\n" + "23:" // Height 2: Multiply loop: Single iteration only + "ld1w { z8.s }, p2/Z, [x12]\n" + "whilelt p0.s, XZR, x28\n" + "subs x28, x28, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x27]\n" + "fmla z24.s, z8.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x26]\n" + "add x27, x27, #0x10\n" + "fmla z25.s, z8.s, z1.s[0]\n" + "add x26, x26, #0x10\n" + "addvl x12, x12, #1\n" + "ble 24f\n" + "ld1w { z9.s }, p2/Z, [x12]\n" + "fmla z24.s, z9.s, z0.s[1]\n" + "subs x28, x28, #0x1\n" + "fmla z25.s, z9.s, z1.s[1]\n" + "addvl x12, x12, #1\n" + "ble 24f\n" + "ld1w { z10.s }, p2/Z, [x12]\n" + "fmla z24.s, z10.s, z0.s[2]\n" + "subs x28, x28, #0x1\n" + "fmla z25.s, z10.s, z1.s[2]\n" + "addvl x12, x12, #1\n" + "ble 24f\n" + "ld1w { z11.s }, p2/Z, [x12]\n" + "fmla z24.s, z11.s, z0.s[3]\n" + "addvl x12, x12, #1\n" + "fmla z25.s, z11.s, z1.s[3]\n" + "24:" // Height 2: Multiply loop: multiply skip + "prfm pldl1keep, [x27, #0x80]\n" + "add x9, x9, #0x1\n" + "prfm pldl1keep, [x26, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp 
x16, x19\n" - "bne 21b\n" - "tbz %x[flags], #1, 27f\n" + "cmp x9, x19\n" + "bne 19b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x26, x10, x19, LSL #2\n" + "tbz %x[flags], #1, 25f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1rw { z17.s }, p2/Z, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -326,151 +316,144 @@ void sve_hybrid_fp32_mla_8x1VL ( "fmin z25.s, p2/M, z25.s, z16.s\n" "fmax z24.s, p2/M, z24.s, z17.s\n" "fmax z25.s, p2/M, z25.s, z17.s\n" - "27:" // Height 2: No activation - "st1w { z24.s }, p1, [x17]\n" - "addvl x17, x17, #1\n" - "st1w { z25.s }, p1, [x13]\n" - "addvl x13, x13, #1\n" - "28:" // Height 2: Writeback done - "decw x6\n" - "cmp x6, XZR\n" - "bgt 17b\n" - "b 114f\n" - "29:" // Height 3 - "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" - "mov x8, %x[bias]\n" - "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 30f\n" - "ldr x17, [%x[output_ptr], #0x0]\n" - "add x17, x17, x19, LSL #2\n" - "ldr x13, [%x[output_ptr], #0x8]\n" - "ldr x11, [%x[output_ptr], #0x10]\n" - "add x13, x13, x19, LSL #2\n" - "add x11, x11, x19, LSL #2\n" - "b 31f\n" - "30:" // Height 3: setup direct output - "mov x17, %x[output_ptr]\n" - "add x13, x17, x19, LSL #2\n" - "add x11, x13, x19, LSL #2\n" - "31:" // Height 3: Column loop + "25:" // Height 2: No activation + "st1w { z24.s }, p1, [x10]\n" + "addvl x10, x10, #1\n" + "st1w { z25.s }, p1, [x26]\n" + "26:" // Height 2: Writeback done + "decw x13\n" + "cmp x13, XZR\n" + "bgt 15b\n" + "b 106f\n" + "27:" // Height 3 + "ldr x13, [%x[args_ptr], %[offsetof_N]]\n" + "mov x11, %x[bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x10, %x[output_ptr]\n" + "28:" // Height 3: Column loop "mov x19, #0x0\n" - "whilelt p1.s, x19, x6\n" - "cbz x8, 32f\n" - "ld1w { z24.s }, p2/Z, [x8]\n" + "whilelt p1.s, x19, x13\n" + "cbz x11, 29f\n" + "ld1w { z24.s }, p2/Z, [x11]\n" "mov z25.d, z24.d\n" - "addvl x8, x8, #1\n" + "addvl x11, 
x11, #1\n" "mov z26.d, z24.d\n" - "b 34f\n" - "32:" // Height 3: no bias - "tbz %x[flags], #0, 33f\n" - "ld1w { z24.s }, p1/Z, [x17]\n" - "ld1w { z25.s }, p1/Z, [x13]\n" - "ld1w { z26.s }, p1/Z, [x11]\n" - "b 34f\n" - "33:" // Height 3: no accumulate + "b 31f\n" + "29:" // Height 3: no bias + "tbz %x[flags], #0, 30f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z24.s }, p1/Z, [x10]\n" + "add x26, x10, x19, LSL #2\n" + "ld1w { z25.s }, p1/Z, [x26]\n" + "add x25, x26, x19, LSL #2\n" + "ld1w { z26.s }, p1/Z, [x25]\n" + "b 31f\n" + "30:" // Height 3: no accumulate "mov z24.b, #0x0\n" "mov z25.b, #0x0\n" "mov z26.b, #0x0\n" - "34:" // Height 3: setup done - "mov x16, #0x0\n" - "35:" // Height 3: String loop + "31:" // Height 3: setup done + "mov x9, #0x0\n" + "32:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w15, [x20, x16, LSL #0x2]\n" - "tbz %x[flags], #3, 36f\n" - "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "ldr w28, [x20, x9, LSL #0x2]\n" + "tbz %x[flags], #3, 33f\n" + "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x14, [x20, #0x0]\n" - "ldr x12, [x20, #0x8]\n" - "ldr x10, [x20, #0x10]\n" - "cbnz x16, 37f\n" + "ldr x27, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x25, [x20, #0x10]\n" + "cbnz x9, 34f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x14, x14, x19, LSL #2\n" - "add x12, x12, x19, LSL #2\n" - "add x10, x10, x19, LSL #2\n" - "b 37f\n" - "36:" // Height 3: setup direct input - "mov x14, %x[input_ptr]\n" - "add x12, x14, x19, LSL #2\n" - "add x10, x12, x19, LSL #2\n" - "37:" // Height 3: input setup done - "cmp x15, #0x4\n" - "ble 39f\n" - "38:" // Height 3: Multiply loop: Main loop head - "ld1w { z8.s }, p2/Z, [x7]\n" - "whilelt p0.s, XZR, x15\n" - "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n" - "sub x15, x15, #0x4\n" - "ld1rqw { z0.s }, p0/Z, [x14]\n" + "add x27, x27, x19, LSL 
#2\n" + "add x26, x26, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "b 34f\n" + "33:" // Height 3: setup direct input + "mov x27, %x[input_ptr]\n" + "add x26, x27, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "34:" // Height 3: input setup done + "cmp x28, #0x4\n" + "ble 36f\n" + "35:" // Height 3: Multiply loop: Main loop head + "ld1w { z8.s }, p2/Z, [x12]\n" + "whilelt p0.s, XZR, x28\n" + "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n" + "sub x28, x28, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" - "ld1rqw { z1.s }, p0/Z, [x12]\n" - "add x14, x14, #0x10\n" + "ld1rqw { z1.s }, p0/Z, [x26]\n" + "add x27, x27, #0x10\n" "fmla z25.s, z8.s, z1.s[0]\n" - "ld1rqw { z2.s }, p0/Z, [x10]\n" - "add x12, x12, #0x10\n" + "ld1rqw { z2.s }, p0/Z, [x25]\n" + "add x26, x26, #0x10\n" "fmla z24.s, z9.s, z0.s[1]\n" - "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n" - "add x10, x10, #0x10\n" + "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" + "add x25, x25, #0x10\n" "fmla z26.s, z8.s, z2.s[0]\n" - "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n" - "cmp x15, #0x4\n" + "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" + "cmp x28, #0x4\n" "fmla z25.s, z9.s, z1.s[1]\n" - "prfm pldl1keep, [x14, #0x80]\n" - "addvl x7, x7, #4\n" + "prfm pldl1keep, [x27, #0x80]\n" + "addvl x12, x12, #4\n" "fmla z24.s, z10.s, z0.s[2]\n" - "prfm pldl1keep, [x12, #0x80]\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" "fmla z26.s, z9.s, z2.s[1]\n" "fmla z25.s, z10.s, z1.s[2]\n" "fmla z24.s, z11.s, z0.s[3]\n" "fmla z26.s, z10.s, z2.s[2]\n" "fmla z25.s, z11.s, z1.s[3]\n" "fmla z26.s, z11.s, z2.s[3]\n" - "bgt 38b\n" - "39:" // Height 3: Multiply loop: Single iteration only - "ld1w { z12.s }, p2/Z, [x7]\n" - "whilelt p0.s, XZR, x15\n" - "subs x15, x15, #0x1\n" - "ld1rqw { z0.s }, p0/Z, [x14]\n" - "fmla z24.s, z12.s, z0.s[0]\n" - "ld1rqw { z1.s }, p0/Z, [x12]\n" - "add x14, x14, #0x10\n" - "fmla z25.s, z12.s, z1.s[0]\n" - "ld1rqw { z2.s }, p0/Z, 
[x10]\n" - "add x12, x12, #0x10\n" - "fmla z26.s, z12.s, z2.s[0]\n" - "add x10, x10, #0x10\n" - "addvl x7, x7, #1\n" - "ble 40f\n" - "ld1w { z13.s }, p2/Z, [x7]\n" - "fmla z24.s, z13.s, z0.s[1]\n" - "subs x15, x15, #0x1\n" - "fmla z25.s, z13.s, z1.s[1]\n" - "addvl x7, x7, #1\n" - "fmla z26.s, z13.s, z2.s[1]\n" - "ble 40f\n" - "ld1w { z14.s }, p2/Z, [x7]\n" - "fmla z24.s, z14.s, z0.s[2]\n" - "subs x15, x15, #0x1\n" - "fmla z25.s, z14.s, z1.s[2]\n" - "addvl x7, x7, #1\n" - "fmla z26.s, z14.s, z2.s[2]\n" - "ble 40f\n" - "ld1w { z15.s }, p2/Z, [x7]\n" - "fmla z24.s, z15.s, z0.s[3]\n" - "addvl x7, x7, #1\n" - "fmla z25.s, z15.s, z1.s[3]\n" - "fmla z26.s, z15.s, z2.s[3]\n" - "40:" // Height 3: Multiply loop: multiply skip - "prfm pldl1keep, [x14, #0x80]\n" - "add x16, x16, #0x1\n" - "prfm pldl1keep, [x12, #0x80]\n" - "prfm pldl1keep, [x10, #0x80]\n" + "bgt 35b\n" + "36:" // Height 3: Multiply loop: Single iteration only + "ld1w { z8.s }, p2/Z, [x12]\n" + "whilelt p0.s, XZR, x28\n" + "subs x28, x28, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x27]\n" + "fmla z24.s, z8.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x26]\n" + "add x27, x27, #0x10\n" + "fmla z25.s, z8.s, z1.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x25]\n" + "add x26, x26, #0x10\n" + "fmla z26.s, z8.s, z2.s[0]\n" + "add x25, x25, #0x10\n" + "addvl x12, x12, #1\n" + "ble 37f\n" + "ld1w { z9.s }, p2/Z, [x12]\n" + "fmla z24.s, z9.s, z0.s[1]\n" + "subs x28, x28, #0x1\n" + "fmla z25.s, z9.s, z1.s[1]\n" + "addvl x12, x12, #1\n" + "fmla z26.s, z9.s, z2.s[1]\n" + "ble 37f\n" + "ld1w { z10.s }, p2/Z, [x12]\n" + "fmla z24.s, z10.s, z0.s[2]\n" + "subs x28, x28, #0x1\n" + "fmla z25.s, z10.s, z1.s[2]\n" + "addvl x12, x12, #1\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "ble 37f\n" + "ld1w { z11.s }, p2/Z, [x12]\n" + "fmla z24.s, z11.s, z0.s[3]\n" + "addvl x12, x12, #1\n" + "fmla z25.s, z11.s, z1.s[3]\n" + "fmla z26.s, z11.s, z2.s[3]\n" + "37:" // Height 3: Multiply loop: multiply skip + "prfm pldl1keep, [x27, #0x80]\n" + "add x9, x9, #0x1\n" + "prfm 
pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x16, x19\n" - "bne 35b\n" - "tbz %x[flags], #1, 41f\n" + "cmp x9, x19\n" + "bne 32b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x26, x10, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "tbz %x[flags], #1, 38f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1rw { z17.s }, p2/Z, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -481,117 +464,104 @@ void sve_hybrid_fp32_mla_8x1VL ( "fmax z24.s, p2/M, z24.s, z17.s\n" "fmax z25.s, p2/M, z25.s, z17.s\n" "fmax z26.s, p2/M, z26.s, z17.s\n" - "41:" // Height 3: No activation - "st1w { z24.s }, p1, [x17]\n" - "addvl x17, x17, #1\n" - "st1w { z25.s }, p1, [x13]\n" - "addvl x13, x13, #1\n" - "st1w { z26.s }, p1, [x11]\n" - "addvl x11, x11, #1\n" - "42:" // Height 3: Writeback done - "decw x6\n" - "cmp x6, XZR\n" - "bgt 31b\n" - "b 114f\n" - "43:" // Height 4 - "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" - "mov x8, %x[bias]\n" - "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 44f\n" - "ldr x17, [%x[output_ptr], #0x0]\n" - "add x17, x17, x19, LSL #2\n" - "ldr x13, [%x[output_ptr], #0x8]\n" - "ldr x11, [%x[output_ptr], #0x10]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x18]\n" - "add x11, x11, x19, LSL #2\n" - "add x9, x9, x19, LSL #2\n" - "b 45f\n" - "44:" // Height 4: setup direct output - "mov x17, %x[output_ptr]\n" - "add x13, x17, x19, LSL #2\n" - "add x11, x13, x19, LSL #2\n" - "add x9, x11, x19, LSL #2\n" - "45:" // Height 4: Column loop + "38:" // Height 3: No activation + "st1w { z24.s }, p1, [x10]\n" + "addvl x10, x10, #1\n" + "st1w { z25.s }, p1, [x26]\n" + "st1w { z26.s }, p1, [x25]\n" + "39:" // Height 3: Writeback done + "decw x13\n" + "cmp x13, XZR\n" + "bgt 28b\n" + "b 106f\n" + "40:" // Height 4 + "ldr x13, [%x[args_ptr], %[offsetof_N]]\n" + "mov x11, %x[bias]\n" + "ldr 
x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x10, %x[output_ptr]\n" + "41:" // Height 4: Column loop "mov x19, #0x0\n" - "whilelt p1.s, x19, x6\n" - "cbz x8, 46f\n" - "ld1w { z24.s }, p2/Z, [x8]\n" + "whilelt p1.s, x19, x13\n" + "cbz x11, 42f\n" + "ld1w { z24.s }, p2/Z, [x11]\n" "mov z25.d, z24.d\n" - "addvl x8, x8, #1\n" + "addvl x11, x11, #1\n" "mov z26.d, z24.d\n" "mov z27.d, z24.d\n" - "b 48f\n" - "46:" // Height 4: no bias - "tbz %x[flags], #0, 47f\n" - "ld1w { z24.s }, p1/Z, [x17]\n" - "ld1w { z25.s }, p1/Z, [x13]\n" - "ld1w { z26.s }, p1/Z, [x11]\n" - "ld1w { z27.s }, p1/Z, [x9]\n" - "b 48f\n" - "47:" // Height 4: no accumulate + "b 44f\n" + "42:" // Height 4: no bias + "tbz %x[flags], #0, 43f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z24.s }, p1/Z, [x10]\n" + "add x26, x10, x19, LSL #2\n" + "ld1w { z25.s }, p1/Z, [x26]\n" + "add x25, x26, x19, LSL #2\n" + "ld1w { z26.s }, p1/Z, [x25]\n" + "add x24, x25, x19, LSL #2\n" + "ld1w { z27.s }, p1/Z, [x24]\n" + "b 44f\n" + "43:" // Height 4: no accumulate "mov z24.b, #0x0\n" "mov z25.b, #0x0\n" "mov z26.b, #0x0\n" "mov z27.b, #0x0\n" - "48:" // Height 4: setup done - "mov x16, #0x0\n" - "49:" // Height 4: String loop + "44:" // Height 4: setup done + "mov x9, #0x0\n" + "45:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w15, [x20, x16, LSL #0x2]\n" - "tbz %x[flags], #3, 50f\n" - "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "ldr w28, [x20, x9, LSL #0x2]\n" + "tbz %x[flags], #3, 46f\n" + "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x14, [x20, #0x0]\n" - "ldr x12, [x20, #0x8]\n" - "ldr x10, [x20, #0x10]\n" - "ldr x28, [x20, #0x18]\n" - "cbnz x16, 51f\n" + "ldr x27, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x25, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "cbnz x9, 47f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x14, x14, 
x19, LSL #2\n" - "add x12, x12, x19, LSL #2\n" - "add x10, x10, x19, LSL #2\n" - "add x28, x28, x19, LSL #2\n" - "b 51f\n" - "50:" // Height 4: setup direct input - "mov x14, %x[input_ptr]\n" - "add x12, x14, x19, LSL #2\n" - "add x10, x12, x19, LSL #2\n" - "add x28, x10, x19, LSL #2\n" - "51:" // Height 4: input setup done - "cmp x15, #0x4\n" - "ble 53f\n" - "52:" // Height 4: Multiply loop: Main loop head - "ld1w { z8.s }, p2/Z, [x7]\n" - "whilelt p0.s, XZR, x15\n" - "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n" - "sub x15, x15, #0x4\n" - "ld1rqw { z0.s }, p0/Z, [x14]\n" + "add x27, x27, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "b 47f\n" + "46:" // Height 4: setup direct input + "mov x27, %x[input_ptr]\n" + "add x26, x27, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "47:" // Height 4: input setup done + "cmp x28, #0x4\n" + "ble 49f\n" + "48:" // Height 4: Multiply loop: Main loop head + "ld1w { z8.s }, p2/Z, [x12]\n" + "whilelt p0.s, XZR, x28\n" + "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n" + "sub x28, x28, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" - "ld1rqw { z1.s }, p0/Z, [x12]\n" - "add x14, x14, #0x10\n" + "ld1rqw { z1.s }, p0/Z, [x26]\n" + "add x27, x27, #0x10\n" "fmla z25.s, z8.s, z1.s[0]\n" - "ld1rqw { z2.s }, p0/Z, [x10]\n" - "add x12, x12, #0x10\n" + "ld1rqw { z2.s }, p0/Z, [x25]\n" + "add x26, x26, #0x10\n" "fmla z24.s, z9.s, z0.s[1]\n" - "ld1rqw { z3.s }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqw { z3.s }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "fmla z26.s, z8.s, z2.s[0]\n" - "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n" - "add x28, x28, #0x10\n" + "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" + "add x24, x24, #0x10\n" "fmla z27.s, z8.s, z3.s[0]\n" - "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n" - "cmp x15, #0x4\n" + "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" + "cmp x28, #0x4\n" "fmla z25.s, z9.s, z1.s[1]\n" - "prfm 
pldl1keep, [x14, #0x80]\n" - "addvl x7, x7, #4\n" + "prfm pldl1keep, [x27, #0x80]\n" + "addvl x12, x12, #4\n" "fmla z24.s, z10.s, z0.s[2]\n" - "prfm pldl1keep, [x12, #0x80]\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" "fmla z26.s, z9.s, z2.s[1]\n" - "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" "fmla z27.s, z9.s, z3.s[1]\n" "fmla z25.s, z10.s, z1.s[2]\n" "fmla z24.s, z11.s, z0.s[3]\n" @@ -600,57 +570,61 @@ void sve_hybrid_fp32_mla_8x1VL ( "fmla z25.s, z11.s, z1.s[3]\n" "fmla z26.s, z11.s, z2.s[3]\n" "fmla z27.s, z11.s, z3.s[3]\n" - "bgt 52b\n" - "53:" // Height 4: Multiply loop: Single iteration only - "ld1w { z12.s }, p2/Z, [x7]\n" - "whilelt p0.s, XZR, x15\n" - "subs x15, x15, #0x1\n" - "ld1rqw { z0.s }, p0/Z, [x14]\n" - "fmla z24.s, z12.s, z0.s[0]\n" - "ld1rqw { z1.s }, p0/Z, [x12]\n" - "add x14, x14, #0x10\n" - "fmla z25.s, z12.s, z1.s[0]\n" - "ld1rqw { z2.s }, p0/Z, [x10]\n" - "add x12, x12, #0x10\n" - "fmla z26.s, z12.s, z2.s[0]\n" - "ld1rqw { z3.s }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" - "fmla z27.s, z12.s, z3.s[0]\n" - "add x28, x28, #0x10\n" - "addvl x7, x7, #1\n" - "ble 54f\n" - "ld1w { z13.s }, p2/Z, [x7]\n" - "fmla z24.s, z13.s, z0.s[1]\n" - "subs x15, x15, #0x1\n" - "fmla z25.s, z13.s, z1.s[1]\n" - "addvl x7, x7, #1\n" - "fmla z26.s, z13.s, z2.s[1]\n" - "fmla z27.s, z13.s, z3.s[1]\n" - "ble 54f\n" - "ld1w { z14.s }, p2/Z, [x7]\n" - "fmla z24.s, z14.s, z0.s[2]\n" - "subs x15, x15, #0x1\n" - "fmla z25.s, z14.s, z1.s[2]\n" - "addvl x7, x7, #1\n" - "fmla z26.s, z14.s, z2.s[2]\n" - "fmla z27.s, z14.s, z3.s[2]\n" - "ble 54f\n" - "ld1w { z15.s }, p2/Z, [x7]\n" - "fmla z24.s, z15.s, z0.s[3]\n" - "addvl x7, x7, #1\n" - "fmla z25.s, z15.s, z1.s[3]\n" - "fmla z26.s, z15.s, z2.s[3]\n" - "fmla z27.s, z15.s, z3.s[3]\n" - "54:" // Height 4: Multiply loop: multiply skip - "prfm pldl1keep, [x14, #0x80]\n" - "add x16, x16, #0x1\n" - "prfm pldl1keep, [x12, #0x80]\n" - "prfm 
pldl1keep, [x10, #0x80]\n" - "prfm pldl1keep, [x28, #0x80]\n" + "bgt 48b\n" + "49:" // Height 4: Multiply loop: Single iteration only + "ld1w { z8.s }, p2/Z, [x12]\n" + "whilelt p0.s, XZR, x28\n" + "subs x28, x28, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x27]\n" + "fmla z24.s, z8.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x26]\n" + "add x27, x27, #0x10\n" + "fmla z25.s, z8.s, z1.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x25]\n" + "add x26, x26, #0x10\n" + "fmla z26.s, z8.s, z2.s[0]\n" + "ld1rqw { z3.s }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" + "fmla z27.s, z8.s, z3.s[0]\n" + "add x24, x24, #0x10\n" + "addvl x12, x12, #1\n" + "ble 50f\n" + "ld1w { z9.s }, p2/Z, [x12]\n" + "fmla z24.s, z9.s, z0.s[1]\n" + "subs x28, x28, #0x1\n" + "fmla z25.s, z9.s, z1.s[1]\n" + "addvl x12, x12, #1\n" + "fmla z26.s, z9.s, z2.s[1]\n" + "fmla z27.s, z9.s, z3.s[1]\n" + "ble 50f\n" + "ld1w { z10.s }, p2/Z, [x12]\n" + "fmla z24.s, z10.s, z0.s[2]\n" + "subs x28, x28, #0x1\n" + "fmla z25.s, z10.s, z1.s[2]\n" + "addvl x12, x12, #1\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z27.s, z10.s, z3.s[2]\n" + "ble 50f\n" + "ld1w { z11.s }, p2/Z, [x12]\n" + "fmla z24.s, z11.s, z0.s[3]\n" + "addvl x12, x12, #1\n" + "fmla z25.s, z11.s, z1.s[3]\n" + "fmla z26.s, z11.s, z2.s[3]\n" + "fmla z27.s, z11.s, z3.s[3]\n" + "50:" // Height 4: Multiply loop: multiply skip + "prfm pldl1keep, [x27, #0x80]\n" + "add x9, x9, #0x1\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x16, x19\n" - "bne 49b\n" - "tbz %x[flags], #1, 55f\n" + "cmp x9, x19\n" + "bne 45b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x26, x10, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "tbz %x[flags], #1, 51f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1rw { z17.s }, p2/Z, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -663,134 +637,118 @@ void sve_hybrid_fp32_mla_8x1VL ( 
"fmax z25.s, p2/M, z25.s, z17.s\n" "fmax z26.s, p2/M, z26.s, z17.s\n" "fmax z27.s, p2/M, z27.s, z17.s\n" - "55:" // Height 4: No activation - "st1w { z24.s }, p1, [x17]\n" - "addvl x17, x17, #1\n" - "st1w { z25.s }, p1, [x13]\n" - "addvl x13, x13, #1\n" - "st1w { z26.s }, p1, [x11]\n" - "addvl x11, x11, #1\n" - "st1w { z27.s }, p1, [x9]\n" - "addvl x9, x9, #1\n" - "56:" // Height 4: Writeback done - "decw x6\n" - "cmp x6, XZR\n" - "bgt 45b\n" - "b 114f\n" - "57:" // Height 5 - "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" - "mov x8, %x[bias]\n" - "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 58f\n" - "ldr x17, [%x[output_ptr], #0x0]\n" - "add x17, x17, x19, LSL #2\n" - "ldr x13, [%x[output_ptr], #0x8]\n" - "ldr x11, [%x[output_ptr], #0x10]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x18]\n" - "ldr x27, [%x[output_ptr], #0x20]\n" - "add x11, x11, x19, LSL #2\n" - "add x9, x9, x19, LSL #2\n" - "add x27, x27, x19, LSL #2\n" - "b 59f\n" - "58:" // Height 5: setup direct output - "mov x17, %x[output_ptr]\n" - "add x13, x17, x19, LSL #2\n" - "add x11, x13, x19, LSL #2\n" - "add x9, x11, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "59:" // Height 5: Column loop + "51:" // Height 4: No activation + "st1w { z24.s }, p1, [x10]\n" + "addvl x10, x10, #1\n" + "st1w { z25.s }, p1, [x26]\n" + "st1w { z26.s }, p1, [x25]\n" + "st1w { z27.s }, p1, [x24]\n" + "52:" // Height 4: Writeback done + "decw x13\n" + "cmp x13, XZR\n" + "bgt 41b\n" + "b 106f\n" + "53:" // Height 5 + "ldr x13, [%x[args_ptr], %[offsetof_N]]\n" + "mov x11, %x[bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x10, %x[output_ptr]\n" + "54:" // Height 5: Column loop "mov x19, #0x0\n" - "whilelt p1.s, x19, x6\n" - "cbz x8, 60f\n" - "ld1w { z24.s }, p2/Z, [x8]\n" + "whilelt p1.s, x19, x13\n" + "cbz x11, 55f\n" + "ld1w { z24.s }, p2/Z, [x11]\n" "mov z25.d, z24.d\n" - "addvl x8, x8, #1\n" + "addvl x11, 
x11, #1\n" "mov z26.d, z24.d\n" "mov z27.d, z24.d\n" "mov z28.d, z24.d\n" - "b 62f\n" - "60:" // Height 5: no bias - "tbz %x[flags], #0, 61f\n" - "ld1w { z24.s }, p1/Z, [x17]\n" - "ld1w { z25.s }, p1/Z, [x13]\n" - "ld1w { z26.s }, p1/Z, [x11]\n" - "ld1w { z27.s }, p1/Z, [x9]\n" - "ld1w { z28.s }, p1/Z, [x27]\n" - "b 62f\n" - "61:" // Height 5: no accumulate + "b 57f\n" + "55:" // Height 5: no bias + "tbz %x[flags], #0, 56f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z24.s }, p1/Z, [x10]\n" + "add x26, x10, x19, LSL #2\n" + "ld1w { z25.s }, p1/Z, [x26]\n" + "add x25, x26, x19, LSL #2\n" + "ld1w { z26.s }, p1/Z, [x25]\n" + "add x24, x25, x19, LSL #2\n" + "ld1w { z27.s }, p1/Z, [x24]\n" + "add x23, x24, x19, LSL #2\n" + "ld1w { z28.s }, p1/Z, [x23]\n" + "b 57f\n" + "56:" // Height 5: no accumulate "mov z24.b, #0x0\n" "mov z25.b, #0x0\n" "mov z26.b, #0x0\n" "mov z27.b, #0x0\n" "mov z28.b, #0x0\n" - "62:" // Height 5: setup done - "mov x16, #0x0\n" - "63:" // Height 5: String loop + "57:" // Height 5: setup done + "mov x9, #0x0\n" + "58:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w15, [x20, x16, LSL #0x2]\n" - "tbz %x[flags], #3, 64f\n" - "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "ldr w28, [x20, x9, LSL #0x2]\n" + "tbz %x[flags], #3, 59f\n" + "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x14, [x20, #0x0]\n" - "ldr x12, [x20, #0x8]\n" - "ldr x10, [x20, #0x10]\n" - "ldr x28, [x20, #0x18]\n" - "ldr x26, [x20, #0x20]\n" - "cbnz x16, 65f\n" + "ldr x27, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x25, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x23, [x20, #0x20]\n" + "cbnz x9, 60f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x14, x14, x19, LSL #2\n" - "add x12, x12, x19, LSL #2\n" - "add x10, x10, x19, LSL #2\n" - "add x28, x28, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" "add 
x26, x26, x19, LSL #2\n" - "b 65f\n" - "64:" // Height 5: setup direct input - "mov x14, %x[input_ptr]\n" - "add x12, x14, x19, LSL #2\n" - "add x10, x12, x19, LSL #2\n" - "add x28, x10, x19, LSL #2\n" - "add x26, x28, x19, LSL #2\n" - "65:" // Height 5: input setup done - "cmp x15, #0x4\n" - "ble 67f\n" - "66:" // Height 5: Multiply loop: Main loop head - "ld1w { z8.s }, p2/Z, [x7]\n" - "whilelt p0.s, XZR, x15\n" - "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n" - "sub x15, x15, #0x4\n" - "ld1rqw { z0.s }, p0/Z, [x14]\n" + "add x25, x25, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "b 60f\n" + "59:" // Height 5: setup direct input + "mov x27, %x[input_ptr]\n" + "add x26, x27, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "60:" // Height 5: input setup done + "cmp x28, #0x4\n" + "ble 62f\n" + "61:" // Height 5: Multiply loop: Main loop head + "ld1w { z8.s }, p2/Z, [x12]\n" + "whilelt p0.s, XZR, x28\n" + "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n" + "sub x28, x28, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" - "ld1rqw { z1.s }, p0/Z, [x12]\n" - "add x14, x14, #0x10\n" + "ld1rqw { z1.s }, p0/Z, [x26]\n" + "add x27, x27, #0x10\n" "fmla z25.s, z8.s, z1.s[0]\n" - "ld1rqw { z2.s }, p0/Z, [x10]\n" - "add x12, x12, #0x10\n" + "ld1rqw { z2.s }, p0/Z, [x25]\n" + "add x26, x26, #0x10\n" "fmla z24.s, z9.s, z0.s[1]\n" - "ld1rqw { z3.s }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqw { z3.s }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "fmla z26.s, z8.s, z2.s[0]\n" - "ld1rqw { z4.s }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqw { z4.s }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "fmla z27.s, z8.s, z3.s[0]\n" - "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n" - "add x26, x26, #0x10\n" + "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" + "add x23, x23, #0x10\n" "fmla z25.s, z9.s, z1.s[1]\n" - "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n" - "cmp x15, #0x4\n" + 
"ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" + "cmp x28, #0x4\n" "fmla z28.s, z8.s, z4.s[0]\n" - "prfm pldl1keep, [x14, #0x80]\n" - "addvl x7, x7, #4\n" + "prfm pldl1keep, [x27, #0x80]\n" + "addvl x12, x12, #4\n" "fmla z26.s, z9.s, z2.s[1]\n" - "prfm pldl1keep, [x12, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" "fmla z24.s, z10.s, z0.s[2]\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" "fmla z27.s, z9.s, z3.s[1]\n" - "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" "fmla z25.s, z10.s, z1.s[2]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "fmla z28.s, z9.s, z4.s[1]\n" "fmla z26.s, z10.s, z2.s[2]\n" "fmla z27.s, z10.s, z3.s[2]\n" @@ -800,64 +758,69 @@ void sve_hybrid_fp32_mla_8x1VL ( "fmla z26.s, z11.s, z2.s[3]\n" "fmla z27.s, z11.s, z3.s[3]\n" "fmla z28.s, z11.s, z4.s[3]\n" - "bgt 66b\n" - "67:" // Height 5: Multiply loop: Single iteration only - "ld1w { z12.s }, p2/Z, [x7]\n" - "whilelt p0.s, XZR, x15\n" - "subs x15, x15, #0x1\n" - "ld1rqw { z0.s }, p0/Z, [x14]\n" - "fmla z24.s, z12.s, z0.s[0]\n" - "ld1rqw { z1.s }, p0/Z, [x12]\n" - "add x14, x14, #0x10\n" - "fmla z25.s, z12.s, z1.s[0]\n" - "ld1rqw { z2.s }, p0/Z, [x10]\n" - "add x12, x12, #0x10\n" - "fmla z26.s, z12.s, z2.s[0]\n" - "ld1rqw { z3.s }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" - "fmla z27.s, z12.s, z3.s[0]\n" - "ld1rqw { z4.s }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" - "fmla z28.s, z12.s, z4.s[0]\n" + "bgt 61b\n" + "62:" // Height 5: Multiply loop: Single iteration only + "ld1w { z8.s }, p2/Z, [x12]\n" + "whilelt p0.s, XZR, x28\n" + "subs x28, x28, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x27]\n" + "fmla z24.s, z8.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x26]\n" + "add x27, x27, #0x10\n" + "fmla z25.s, z8.s, z1.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x25]\n" "add x26, x26, #0x10\n" - "addvl x7, x7, #1\n" - "ble 68f\n" - "ld1w { z13.s }, p2/Z, [x7]\n" - "fmla z24.s, z13.s, z0.s[1]\n" - "subs x15, x15, #0x1\n" - "fmla z25.s, z13.s, z1.s[1]\n" 
- "addvl x7, x7, #1\n" - "fmla z26.s, z13.s, z2.s[1]\n" - "fmla z27.s, z13.s, z3.s[1]\n" - "fmla z28.s, z13.s, z4.s[1]\n" - "ble 68f\n" - "ld1w { z14.s }, p2/Z, [x7]\n" - "fmla z24.s, z14.s, z0.s[2]\n" - "subs x15, x15, #0x1\n" - "fmla z25.s, z14.s, z1.s[2]\n" - "addvl x7, x7, #1\n" - "fmla z26.s, z14.s, z2.s[2]\n" - "fmla z27.s, z14.s, z3.s[2]\n" - "fmla z28.s, z14.s, z4.s[2]\n" - "ble 68f\n" - "ld1w { z15.s }, p2/Z, [x7]\n" - "fmla z24.s, z15.s, z0.s[3]\n" - "addvl x7, x7, #1\n" - "fmla z25.s, z15.s, z1.s[3]\n" - "fmla z26.s, z15.s, z2.s[3]\n" - "fmla z27.s, z15.s, z3.s[3]\n" - "fmla z28.s, z15.s, z4.s[3]\n" - "68:" // Height 5: Multiply loop: multiply skip - "prfm pldl1keep, [x14, #0x80]\n" - "add x16, x16, #0x1\n" - "prfm pldl1keep, [x12, #0x80]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "prfm pldl1keep, [x28, #0x80]\n" + "fmla z26.s, z8.s, z2.s[0]\n" + "ld1rqw { z3.s }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" + "fmla z27.s, z8.s, z3.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" + "fmla z28.s, z8.s, z4.s[0]\n" + "add x23, x23, #0x10\n" + "addvl x12, x12, #1\n" + "ble 63f\n" + "ld1w { z9.s }, p2/Z, [x12]\n" + "fmla z24.s, z9.s, z0.s[1]\n" + "subs x28, x28, #0x1\n" + "fmla z25.s, z9.s, z1.s[1]\n" + "addvl x12, x12, #1\n" + "fmla z26.s, z9.s, z2.s[1]\n" + "fmla z27.s, z9.s, z3.s[1]\n" + "fmla z28.s, z9.s, z4.s[1]\n" + "ble 63f\n" + "ld1w { z10.s }, p2/Z, [x12]\n" + "fmla z24.s, z10.s, z0.s[2]\n" + "subs x28, x28, #0x1\n" + "fmla z25.s, z10.s, z1.s[2]\n" + "addvl x12, x12, #1\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z27.s, z10.s, z3.s[2]\n" + "fmla z28.s, z10.s, z4.s[2]\n" + "ble 63f\n" + "ld1w { z11.s }, p2/Z, [x12]\n" + "fmla z24.s, z11.s, z0.s[3]\n" + "addvl x12, x12, #1\n" + "fmla z25.s, z11.s, z1.s[3]\n" + "fmla z26.s, z11.s, z2.s[3]\n" + "fmla z27.s, z11.s, z3.s[3]\n" + "fmla z28.s, z11.s, z4.s[3]\n" + "63:" // Height 5: Multiply loop: multiply skip + "prfm pldl1keep, [x27, #0x80]\n" + "add x9, x9, #0x1\n" "prfm pldl1keep, [x26, 
#0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x16, x19\n" - "bne 63b\n" - "tbz %x[flags], #1, 69f\n" + "cmp x9, x19\n" + "bne 58b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x26, x10, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "tbz %x[flags], #1, 64f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1rw { z17.s }, p2/Z, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -872,150 +835,131 @@ void sve_hybrid_fp32_mla_8x1VL ( "fmax z26.s, p2/M, z26.s, z17.s\n" "fmax z27.s, p2/M, z27.s, z17.s\n" "fmax z28.s, p2/M, z28.s, z17.s\n" - "69:" // Height 5: No activation - "st1w { z24.s }, p1, [x17]\n" - "addvl x17, x17, #1\n" - "st1w { z25.s }, p1, [x13]\n" - "addvl x13, x13, #1\n" - "st1w { z26.s }, p1, [x11]\n" - "addvl x11, x11, #1\n" - "st1w { z27.s }, p1, [x9]\n" - "addvl x9, x9, #1\n" - "st1w { z28.s }, p1, [x27]\n" - "addvl x27, x27, #1\n" - "70:" // Height 5: Writeback done - "decw x6\n" - "cmp x6, XZR\n" - "bgt 59b\n" - "b 114f\n" - "71:" // Height 6 - "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" - "mov x8, %x[bias]\n" - "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 72f\n" - "ldr x17, [%x[output_ptr], #0x0]\n" - "add x17, x17, x19, LSL #2\n" - "ldr x13, [%x[output_ptr], #0x8]\n" - "ldr x11, [%x[output_ptr], #0x10]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x18]\n" - "ldr x27, [%x[output_ptr], #0x20]\n" - "add x11, x11, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x28]\n" - "add x9, x9, x19, LSL #2\n" - "add x27, x27, x19, LSL #2\n" - "add x25, x25, x19, LSL #2\n" - "b 73f\n" - "72:" // Height 6: setup direct output - "mov x17, %x[output_ptr]\n" - "add x13, x17, x19, LSL #2\n" - "add x11, x13, x19, LSL #2\n" - "add x9, x11, x19, LSL #2\n" - "add x27, 
x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "73:" // Height 6: Column loop + "64:" // Height 5: No activation + "st1w { z24.s }, p1, [x10]\n" + "addvl x10, x10, #1\n" + "st1w { z25.s }, p1, [x26]\n" + "st1w { z26.s }, p1, [x25]\n" + "st1w { z27.s }, p1, [x24]\n" + "st1w { z28.s }, p1, [x23]\n" + "65:" // Height 5: Writeback done + "decw x13\n" + "cmp x13, XZR\n" + "bgt 54b\n" + "b 106f\n" + "66:" // Height 6 + "ldr x13, [%x[args_ptr], %[offsetof_N]]\n" + "mov x11, %x[bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x10, %x[output_ptr]\n" + "67:" // Height 6: Column loop "mov x19, #0x0\n" - "whilelt p1.s, x19, x6\n" - "cbz x8, 74f\n" - "ld1w { z24.s }, p2/Z, [x8]\n" + "whilelt p1.s, x19, x13\n" + "cbz x11, 68f\n" + "ld1w { z24.s }, p2/Z, [x11]\n" "mov z25.d, z24.d\n" - "addvl x8, x8, #1\n" + "addvl x11, x11, #1\n" "mov z26.d, z24.d\n" "mov z27.d, z24.d\n" "mov z28.d, z24.d\n" "mov z29.d, z24.d\n" - "b 76f\n" - "74:" // Height 6: no bias - "tbz %x[flags], #0, 75f\n" - "ld1w { z24.s }, p1/Z, [x17]\n" - "ld1w { z25.s }, p1/Z, [x13]\n" - "ld1w { z26.s }, p1/Z, [x11]\n" - "ld1w { z27.s }, p1/Z, [x9]\n" - "ld1w { z28.s }, p1/Z, [x27]\n" - "ld1w { z29.s }, p1/Z, [x25]\n" - "b 76f\n" - "75:" // Height 6: no accumulate + "b 70f\n" + "68:" // Height 6: no bias + "tbz %x[flags], #0, 69f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z24.s }, p1/Z, [x10]\n" + "add x26, x10, x19, LSL #2\n" + "ld1w { z25.s }, p1/Z, [x26]\n" + "add x25, x26, x19, LSL #2\n" + "ld1w { z26.s }, p1/Z, [x25]\n" + "add x24, x25, x19, LSL #2\n" + "ld1w { z27.s }, p1/Z, [x24]\n" + "add x23, x24, x19, LSL #2\n" + "ld1w { z28.s }, p1/Z, [x23]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z29.s }, p1/Z, [x22]\n" + "b 70f\n" + "69:" // Height 6: no accumulate "mov z24.b, #0x0\n" "mov z25.b, #0x0\n" "mov z26.b, #0x0\n" "mov z27.b, #0x0\n" "mov z28.b, #0x0\n" "mov z29.b, #0x0\n" - "76:" // Height 6: setup done - "mov x16, #0x0\n" - "77:" // Height 6: String loop + 
"70:" // Height 6: setup done + "mov x9, #0x0\n" + "71:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w15, [x20, x16, LSL #0x2]\n" - "tbz %x[flags], #3, 78f\n" - "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "ldr w28, [x20, x9, LSL #0x2]\n" + "tbz %x[flags], #3, 72f\n" + "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x14, [x20, #0x0]\n" - "ldr x12, [x20, #0x8]\n" - "ldr x10, [x20, #0x10]\n" - "ldr x28, [x20, #0x18]\n" - "ldr x26, [x20, #0x20]\n" - "ldr x24, [x20, #0x28]\n" - "cbnz x16, 79f\n" + "ldr x27, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x25, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x23, [x20, #0x20]\n" + "ldr x22, [x20, #0x28]\n" + "cbnz x9, 73f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x14, x14, x19, LSL #2\n" - "add x12, x12, x19, LSL #2\n" - "add x10, x10, x19, LSL #2\n" - "add x28, x28, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" "add x26, x26, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" "add x24, x24, x19, LSL #2\n" - "b 79f\n" - "78:" // Height 6: setup direct input - "mov x14, %x[input_ptr]\n" - "add x12, x14, x19, LSL #2\n" - "add x10, x12, x19, LSL #2\n" - "add x28, x10, x19, LSL #2\n" - "add x26, x28, x19, LSL #2\n" - "add x24, x26, x19, LSL #2\n" - "79:" // Height 6: input setup done - "cmp x15, #0x4\n" - "ble 81f\n" - "80:" // Height 6: Multiply loop: Main loop head - "ld1w { z8.s }, p2/Z, [x7]\n" - "whilelt p0.s, XZR, x15\n" - "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n" - "sub x15, x15, #0x4\n" - "ld1rqw { z0.s }, p0/Z, [x14]\n" + "add x23, x23, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "b 73f\n" + "72:" // Height 6: setup direct input + "mov x27, %x[input_ptr]\n" + "add x26, x27, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "73:" // Height 6: input setup done + 
"cmp x28, #0x4\n" + "ble 75f\n" + "74:" // Height 6: Multiply loop: Main loop head + "ld1w { z8.s }, p2/Z, [x12]\n" + "whilelt p0.s, XZR, x28\n" + "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n" + "sub x28, x28, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" - "ld1rqw { z1.s }, p0/Z, [x12]\n" - "add x14, x14, #0x10\n" + "ld1rqw { z1.s }, p0/Z, [x26]\n" + "add x27, x27, #0x10\n" "fmla z25.s, z8.s, z1.s[0]\n" - "ld1rqw { z2.s }, p0/Z, [x10]\n" - "add x12, x12, #0x10\n" + "ld1rqw { z2.s }, p0/Z, [x25]\n" + "add x26, x26, #0x10\n" "fmla z24.s, z9.s, z0.s[1]\n" - "ld1rqw { z3.s }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqw { z3.s }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "fmla z26.s, z8.s, z2.s[0]\n" - "ld1rqw { z4.s }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqw { z4.s }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "fmla z27.s, z8.s, z3.s[0]\n" - "ld1rqw { z5.s }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqw { z5.s }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "fmla z25.s, z9.s, z1.s[1]\n" - "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n" - "add x24, x24, #0x10\n" + "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" + "add x22, x22, #0x10\n" "fmla z28.s, z8.s, z4.s[0]\n" - "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n" - "cmp x15, #0x4\n" + "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" + "cmp x28, #0x4\n" "fmla z29.s, z8.s, z5.s[0]\n" - "prfm pldl1keep, [x14, #0x80]\n" - "addvl x7, x7, #4\n" + "prfm pldl1keep, [x27, #0x80]\n" + "addvl x12, x12, #4\n" "fmla z26.s, z9.s, z2.s[1]\n" - "prfm pldl1keep, [x12, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" "fmla z27.s, z9.s, z3.s[1]\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" "fmla z24.s, z10.s, z0.s[2]\n" - "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" "fmla z28.s, z9.s, z4.s[1]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "fmla z29.s, z9.s, z5.s[1]\n" - "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" "fmla 
z25.s, z10.s, z1.s[2]\n" "fmla z26.s, z10.s, z2.s[2]\n" "fmla z27.s, z10.s, z3.s[2]\n" @@ -1027,71 +971,77 @@ void sve_hybrid_fp32_mla_8x1VL ( "fmla z27.s, z11.s, z3.s[3]\n" "fmla z28.s, z11.s, z4.s[3]\n" "fmla z29.s, z11.s, z5.s[3]\n" - "bgt 80b\n" - "81:" // Height 6: Multiply loop: Single iteration only - "ld1w { z12.s }, p2/Z, [x7]\n" - "whilelt p0.s, XZR, x15\n" - "subs x15, x15, #0x1\n" - "ld1rqw { z0.s }, p0/Z, [x14]\n" - "fmla z24.s, z12.s, z0.s[0]\n" - "ld1rqw { z1.s }, p0/Z, [x12]\n" - "add x14, x14, #0x10\n" - "fmla z25.s, z12.s, z1.s[0]\n" - "ld1rqw { z2.s }, p0/Z, [x10]\n" - "add x12, x12, #0x10\n" - "fmla z26.s, z12.s, z2.s[0]\n" - "ld1rqw { z3.s }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" - "fmla z27.s, z12.s, z3.s[0]\n" - "ld1rqw { z4.s }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" - "fmla z28.s, z12.s, z4.s[0]\n" - "ld1rqw { z5.s }, p0/Z, [x24]\n" + "bgt 74b\n" + "75:" // Height 6: Multiply loop: Single iteration only + "ld1w { z8.s }, p2/Z, [x12]\n" + "whilelt p0.s, XZR, x28\n" + "subs x28, x28, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x27]\n" + "fmla z24.s, z8.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x26]\n" + "add x27, x27, #0x10\n" + "fmla z25.s, z8.s, z1.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x25]\n" "add x26, x26, #0x10\n" - "fmla z29.s, z12.s, z5.s[0]\n" + "fmla z26.s, z8.s, z2.s[0]\n" + "ld1rqw { z3.s }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" + "fmla z27.s, z8.s, z3.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x23]\n" "add x24, x24, #0x10\n" - "addvl x7, x7, #1\n" - "ble 82f\n" - "ld1w { z13.s }, p2/Z, [x7]\n" - "fmla z24.s, z13.s, z0.s[1]\n" - "subs x15, x15, #0x1\n" - "fmla z25.s, z13.s, z1.s[1]\n" - "addvl x7, x7, #1\n" - "fmla z26.s, z13.s, z2.s[1]\n" - "fmla z27.s, z13.s, z3.s[1]\n" - "fmla z28.s, z13.s, z4.s[1]\n" - "fmla z29.s, z13.s, z5.s[1]\n" - "ble 82f\n" - "ld1w { z14.s }, p2/Z, [x7]\n" - "fmla z24.s, z14.s, z0.s[2]\n" - "subs x15, x15, #0x1\n" - "fmla z25.s, z14.s, z1.s[2]\n" - "addvl x7, x7, #1\n" - "fmla z26.s, z14.s, z2.s[2]\n" - "fmla z27.s, 
z14.s, z3.s[2]\n" - "fmla z28.s, z14.s, z4.s[2]\n" - "fmla z29.s, z14.s, z5.s[2]\n" - "ble 82f\n" - "ld1w { z15.s }, p2/Z, [x7]\n" - "fmla z24.s, z15.s, z0.s[3]\n" - "addvl x7, x7, #1\n" - "fmla z25.s, z15.s, z1.s[3]\n" - "fmla z26.s, z15.s, z2.s[3]\n" - "fmla z27.s, z15.s, z3.s[3]\n" - "fmla z28.s, z15.s, z4.s[3]\n" - "fmla z29.s, z15.s, z5.s[3]\n" - "82:" // Height 6: Multiply loop: multiply skip - "prfm pldl1keep, [x14, #0x80]\n" - "add x16, x16, #0x1\n" - "prfm pldl1keep, [x12, #0x80]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "prfm pldl1keep, [x28, #0x80]\n" + "fmla z28.s, z8.s, z4.s[0]\n" + "ld1rqw { z5.s }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" + "fmla z29.s, z8.s, z5.s[0]\n" + "add x22, x22, #0x10\n" + "addvl x12, x12, #1\n" + "ble 76f\n" + "ld1w { z9.s }, p2/Z, [x12]\n" + "fmla z24.s, z9.s, z0.s[1]\n" + "subs x28, x28, #0x1\n" + "fmla z25.s, z9.s, z1.s[1]\n" + "addvl x12, x12, #1\n" + "fmla z26.s, z9.s, z2.s[1]\n" + "fmla z27.s, z9.s, z3.s[1]\n" + "fmla z28.s, z9.s, z4.s[1]\n" + "fmla z29.s, z9.s, z5.s[1]\n" + "ble 76f\n" + "ld1w { z10.s }, p2/Z, [x12]\n" + "fmla z24.s, z10.s, z0.s[2]\n" + "subs x28, x28, #0x1\n" + "fmla z25.s, z10.s, z1.s[2]\n" + "addvl x12, x12, #1\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z27.s, z10.s, z3.s[2]\n" + "fmla z28.s, z10.s, z4.s[2]\n" + "fmla z29.s, z10.s, z5.s[2]\n" + "ble 76f\n" + "ld1w { z11.s }, p2/Z, [x12]\n" + "fmla z24.s, z11.s, z0.s[3]\n" + "addvl x12, x12, #1\n" + "fmla z25.s, z11.s, z1.s[3]\n" + "fmla z26.s, z11.s, z2.s[3]\n" + "fmla z27.s, z11.s, z3.s[3]\n" + "fmla z28.s, z11.s, z4.s[3]\n" + "fmla z29.s, z11.s, z5.s[3]\n" + "76:" // Height 6: Multiply loop: multiply skip + "prfm pldl1keep, [x27, #0x80]\n" + "add x9, x9, #0x1\n" "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x16, x19\n" - "bne 77b\n" - "tbz %x[flags], #1, 83f\n" + 
"cmp x9, x19\n" + "bne 71b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x26, x10, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "tbz %x[flags], #1, 77f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1rw { z17.s }, p2/Z, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -1108,77 +1058,55 @@ void sve_hybrid_fp32_mla_8x1VL ( "fmax z28.s, p2/M, z28.s, z17.s\n" "fmin z29.s, p2/M, z29.s, z16.s\n" "fmax z29.s, p2/M, z29.s, z17.s\n" - "83:" // Height 6: No activation - "st1w { z24.s }, p1, [x17]\n" - "addvl x17, x17, #1\n" - "st1w { z25.s }, p1, [x13]\n" - "addvl x13, x13, #1\n" - "st1w { z26.s }, p1, [x11]\n" - "addvl x11, x11, #1\n" - "st1w { z27.s }, p1, [x9]\n" - "addvl x9, x9, #1\n" - "st1w { z28.s }, p1, [x27]\n" - "addvl x27, x27, #1\n" - "st1w { z29.s }, p1, [x25]\n" - "addvl x25, x25, #1\n" - "84:" // Height 6: Writeback done - "decw x6\n" - "cmp x6, XZR\n" - "bgt 73b\n" - "b 114f\n" - "85:" // Height 7 - "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" - "mov x8, %x[bias]\n" - "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 86f\n" - "ldr x17, [%x[output_ptr], #0x0]\n" - "add x17, x17, x19, LSL #2\n" - "ldr x13, [%x[output_ptr], #0x8]\n" - "ldr x11, [%x[output_ptr], #0x10]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x18]\n" - "ldr x27, [%x[output_ptr], #0x20]\n" - "add x11, x11, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x28]\n" - "add x9, x9, x19, LSL #2\n" - "ldr x23, [%x[output_ptr], #0x30]\n" - "add x27, x27, x19, LSL #2\n" - "add x25, x25, x19, LSL #2\n" - "add x23, x23, x19, LSL #2\n" - "b 87f\n" - "86:" // Height 7: setup direct output - "mov x17, %x[output_ptr]\n" - "add x13, x17, x19, LSL #2\n" - "add x11, x13, x19, LSL #2\n" - "add x9, x11, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "add x23, x25, x19, 
LSL #2\n" - "87:" // Height 7: Column loop + "77:" // Height 6: No activation + "st1w { z24.s }, p1, [x10]\n" + "addvl x10, x10, #1\n" + "st1w { z25.s }, p1, [x26]\n" + "st1w { z26.s }, p1, [x25]\n" + "st1w { z27.s }, p1, [x24]\n" + "st1w { z28.s }, p1, [x23]\n" + "st1w { z29.s }, p1, [x22]\n" + "78:" // Height 6: Writeback done + "decw x13\n" + "cmp x13, XZR\n" + "bgt 67b\n" + "b 106f\n" + "79:" // Height 7 + "ldr x13, [%x[args_ptr], %[offsetof_N]]\n" + "mov x11, %x[bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x10, %x[output_ptr]\n" + "80:" // Height 7: Column loop "mov x19, #0x0\n" - "whilelt p1.s, x19, x6\n" - "cbz x8, 88f\n" - "ld1w { z24.s }, p2/Z, [x8]\n" + "whilelt p1.s, x19, x13\n" + "cbz x11, 81f\n" + "ld1w { z24.s }, p2/Z, [x11]\n" "mov z25.d, z24.d\n" - "addvl x8, x8, #1\n" + "addvl x11, x11, #1\n" "mov z26.d, z24.d\n" "mov z27.d, z24.d\n" "mov z28.d, z24.d\n" "mov z29.d, z24.d\n" "mov z30.d, z24.d\n" - "b 90f\n" - "88:" // Height 7: no bias - "tbz %x[flags], #0, 89f\n" - "ld1w { z24.s }, p1/Z, [x17]\n" - "ld1w { z25.s }, p1/Z, [x13]\n" - "ld1w { z26.s }, p1/Z, [x11]\n" - "ld1w { z27.s }, p1/Z, [x9]\n" - "ld1w { z28.s }, p1/Z, [x27]\n" - "ld1w { z29.s }, p1/Z, [x25]\n" - "ld1w { z30.s }, p1/Z, [x23]\n" - "b 90f\n" - "89:" // Height 7: no accumulate + "b 83f\n" + "81:" // Height 7: no bias + "tbz %x[flags], #0, 82f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z24.s }, p1/Z, [x10]\n" + "add x26, x10, x19, LSL #2\n" + "ld1w { z25.s }, p1/Z, [x26]\n" + "add x25, x26, x19, LSL #2\n" + "ld1w { z26.s }, p1/Z, [x25]\n" + "add x24, x25, x19, LSL #2\n" + "ld1w { z27.s }, p1/Z, [x24]\n" + "add x23, x24, x19, LSL #2\n" + "ld1w { z28.s }, p1/Z, [x23]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z29.s }, p1/Z, [x22]\n" + "add x21, x22, x19, LSL #2\n" + "ld1w { z30.s }, p1/Z, [x21]\n" + "b 83f\n" + "82:" // Height 7: no accumulate "mov z24.b, #0x0\n" "mov z25.b, #0x0\n" "mov z26.b, #0x0\n" @@ -1186,88 +1114,88 @@ void 
sve_hybrid_fp32_mla_8x1VL ( "mov z28.b, #0x0\n" "mov z29.b, #0x0\n" "mov z30.b, #0x0\n" - "90:" // Height 7: setup done - "mov x16, #0x0\n" - "91:" // Height 7: String loop + "83:" // Height 7: setup done + "mov x9, #0x0\n" + "84:" // Height 7: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w15, [x20, x16, LSL #0x2]\n" - "tbz %x[flags], #3, 92f\n" - "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "ldr w28, [x20, x9, LSL #0x2]\n" + "tbz %x[flags], #3, 85f\n" + "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x14, [x20, #0x0]\n" - "ldr x12, [x20, #0x8]\n" - "ldr x10, [x20, #0x10]\n" - "ldr x28, [x20, #0x18]\n" - "ldr x26, [x20, #0x20]\n" - "ldr x24, [x20, #0x28]\n" - "ldr x22, [x20, #0x30]\n" - "cbnz x16, 93f\n" + "ldr x27, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x25, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x23, [x20, #0x20]\n" + "ldr x22, [x20, #0x28]\n" + "ldr x21, [x20, #0x30]\n" + "cbnz x9, 86f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x14, x14, x19, LSL #2\n" - "add x12, x12, x19, LSL #2\n" - "add x10, x10, x19, LSL #2\n" - "add x28, x28, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" "add x26, x26, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" "add x22, x22, x19, LSL #2\n" - "b 93f\n" - "92:" // Height 7: setup direct input - "mov x14, %x[input_ptr]\n" - "add x12, x14, x19, LSL #2\n" - "add x10, x12, x19, LSL #2\n" - "add x28, x10, x19, LSL #2\n" - "add x26, x28, x19, LSL #2\n" - "add x24, x26, x19, LSL #2\n" - "add x22, x24, x19, LSL #2\n" - "93:" // Height 7: input setup done - "cmp x15, #0x4\n" - "ble 95f\n" - "94:" // Height 7: Multiply loop: Main loop head - "ld1w { z8.s }, p2/Z, [x7]\n" - "whilelt p0.s, XZR, x15\n" - "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n" - "sub x15, x15, #0x4\n" - "ld1rqw { z0.s }, p0/Z, [x14]\n" + "add x21, x21, x19, 
LSL #2\n" + "b 86f\n" + "85:" // Height 7: setup direct input + "mov x27, %x[input_ptr]\n" + "add x26, x27, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "86:" // Height 7: input setup done + "cmp x28, #0x4\n" + "ble 88f\n" + "87:" // Height 7: Multiply loop: Main loop head + "ld1w { z8.s }, p2/Z, [x12]\n" + "whilelt p0.s, XZR, x28\n" + "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n" + "sub x28, x28, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" - "ld1rqw { z1.s }, p0/Z, [x12]\n" - "add x14, x14, #0x10\n" + "ld1rqw { z1.s }, p0/Z, [x26]\n" + "add x27, x27, #0x10\n" "fmla z25.s, z8.s, z1.s[0]\n" - "ld1rqw { z2.s }, p0/Z, [x10]\n" - "add x12, x12, #0x10\n" + "ld1rqw { z2.s }, p0/Z, [x25]\n" + "add x26, x26, #0x10\n" "fmla z24.s, z9.s, z0.s[1]\n" - "ld1rqw { z3.s }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqw { z3.s }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "fmla z26.s, z8.s, z2.s[0]\n" - "ld1rqw { z4.s }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqw { z4.s }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "fmla z27.s, z8.s, z3.s[0]\n" - "ld1rqw { z5.s }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqw { z5.s }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "fmla z25.s, z9.s, z1.s[1]\n" - "ld1rqw { z6.s }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" - "fmla z28.s, z8.s, z4.s[0]\n" - "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n" + "ld1rqw { z6.s }, p0/Z, [x21]\n" "add x22, x22, #0x10\n" + "fmla z28.s, z8.s, z4.s[0]\n" + "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" + "add x21, x21, #0x10\n" "fmla z29.s, z8.s, z5.s[0]\n" - "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n" - "cmp x15, #0x4\n" + "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" + "cmp x28, #0x4\n" "fmla z30.s, z8.s, z6.s[0]\n" - "prfm pldl1keep, [x14, #0x80]\n" - "addvl x7, x7, #4\n" + "prfm pldl1keep, [x27, #0x80]\n" + "addvl x12, x12, #4\n" "fmla z26.s, z9.s, 
z2.s[1]\n" - "prfm pldl1keep, [x12, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" "fmla z27.s, z9.s, z3.s[1]\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" "fmla z28.s, z9.s, z4.s[1]\n" - "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" "fmla z29.s, z9.s, z5.s[1]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "fmla z30.s, z9.s, z6.s[1]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "fmla z24.s, z10.s, z0.s[2]\n" "prfm pldl1keep, [x22, #0x80]\n" + "fmla z24.s, z10.s, z0.s[2]\n" + "prfm pldl1keep, [x21, #0x80]\n" "fmla z25.s, z10.s, z1.s[2]\n" "fmla z26.s, z10.s, z2.s[2]\n" "fmla z27.s, z10.s, z3.s[2]\n" @@ -1281,78 +1209,85 @@ void sve_hybrid_fp32_mla_8x1VL ( "fmla z28.s, z11.s, z4.s[3]\n" "fmla z29.s, z11.s, z5.s[3]\n" "fmla z30.s, z11.s, z6.s[3]\n" - "bgt 94b\n" - "95:" // Height 7: Multiply loop: Single iteration only - "ld1w { z12.s }, p2/Z, [x7]\n" - "whilelt p0.s, XZR, x15\n" - "subs x15, x15, #0x1\n" - "ld1rqw { z0.s }, p0/Z, [x14]\n" - "fmla z24.s, z12.s, z0.s[0]\n" - "ld1rqw { z1.s }, p0/Z, [x12]\n" - "add x14, x14, #0x10\n" - "fmla z25.s, z12.s, z1.s[0]\n" - "ld1rqw { z2.s }, p0/Z, [x10]\n" - "add x12, x12, #0x10\n" - "fmla z26.s, z12.s, z2.s[0]\n" - "ld1rqw { z3.s }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" - "fmla z27.s, z12.s, z3.s[0]\n" - "ld1rqw { z4.s }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" - "fmla z28.s, z12.s, z4.s[0]\n" - "ld1rqw { z5.s }, p0/Z, [x24]\n" + "bgt 87b\n" + "88:" // Height 7: Multiply loop: Single iteration only + "ld1w { z8.s }, p2/Z, [x12]\n" + "whilelt p0.s, XZR, x28\n" + "subs x28, x28, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x27]\n" + "fmla z24.s, z8.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x26]\n" + "add x27, x27, #0x10\n" + "fmla z25.s, z8.s, z1.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x25]\n" "add x26, x26, #0x10\n" - "fmla z29.s, z12.s, z5.s[0]\n" - "ld1rqw { z6.s }, p0/Z, [x22]\n" + "fmla z26.s, z8.s, z2.s[0]\n" + "ld1rqw { z3.s }, p0/Z, [x24]\n" + "add x25, x25, 
#0x10\n" + "fmla z27.s, z8.s, z3.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x23]\n" "add x24, x24, #0x10\n" - "fmla z30.s, z12.s, z6.s[0]\n" + "fmla z28.s, z8.s, z4.s[0]\n" + "ld1rqw { z5.s }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" + "fmla z29.s, z8.s, z5.s[0]\n" + "ld1rqw { z6.s }, p0/Z, [x21]\n" "add x22, x22, #0x10\n" - "addvl x7, x7, #1\n" - "ble 96f\n" - "ld1w { z13.s }, p2/Z, [x7]\n" - "fmla z24.s, z13.s, z0.s[1]\n" - "subs x15, x15, #0x1\n" - "fmla z25.s, z13.s, z1.s[1]\n" - "addvl x7, x7, #1\n" - "fmla z26.s, z13.s, z2.s[1]\n" - "fmla z27.s, z13.s, z3.s[1]\n" - "fmla z28.s, z13.s, z4.s[1]\n" - "fmla z29.s, z13.s, z5.s[1]\n" - "fmla z30.s, z13.s, z6.s[1]\n" - "ble 96f\n" - "ld1w { z14.s }, p2/Z, [x7]\n" - "fmla z24.s, z14.s, z0.s[2]\n" - "subs x15, x15, #0x1\n" - "fmla z25.s, z14.s, z1.s[2]\n" - "addvl x7, x7, #1\n" - "fmla z26.s, z14.s, z2.s[2]\n" - "fmla z27.s, z14.s, z3.s[2]\n" - "fmla z28.s, z14.s, z4.s[2]\n" - "fmla z29.s, z14.s, z5.s[2]\n" - "fmla z30.s, z14.s, z6.s[2]\n" - "ble 96f\n" - "ld1w { z15.s }, p2/Z, [x7]\n" - "fmla z24.s, z15.s, z0.s[3]\n" - "addvl x7, x7, #1\n" - "fmla z25.s, z15.s, z1.s[3]\n" - "fmla z26.s, z15.s, z2.s[3]\n" - "fmla z27.s, z15.s, z3.s[3]\n" - "fmla z28.s, z15.s, z4.s[3]\n" - "fmla z29.s, z15.s, z5.s[3]\n" - "fmla z30.s, z15.s, z6.s[3]\n" - "96:" // Height 7: Multiply loop: multiply skip - "prfm pldl1keep, [x14, #0x80]\n" - "add x16, x16, #0x1\n" - "prfm pldl1keep, [x12, #0x80]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "prfm pldl1keep, [x28, #0x80]\n" + "fmla z30.s, z8.s, z6.s[0]\n" + "add x21, x21, #0x10\n" + "addvl x12, x12, #1\n" + "ble 89f\n" + "ld1w { z9.s }, p2/Z, [x12]\n" + "fmla z24.s, z9.s, z0.s[1]\n" + "subs x28, x28, #0x1\n" + "fmla z25.s, z9.s, z1.s[1]\n" + "addvl x12, x12, #1\n" + "fmla z26.s, z9.s, z2.s[1]\n" + "fmla z27.s, z9.s, z3.s[1]\n" + "fmla z28.s, z9.s, z4.s[1]\n" + "fmla z29.s, z9.s, z5.s[1]\n" + "fmla z30.s, z9.s, z6.s[1]\n" + "ble 89f\n" + "ld1w { z10.s }, p2/Z, [x12]\n" + "fmla z24.s, z10.s, z0.s[2]\n" + 
"subs x28, x28, #0x1\n" + "fmla z25.s, z10.s, z1.s[2]\n" + "addvl x12, x12, #1\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z27.s, z10.s, z3.s[2]\n" + "fmla z28.s, z10.s, z4.s[2]\n" + "fmla z29.s, z10.s, z5.s[2]\n" + "fmla z30.s, z10.s, z6.s[2]\n" + "ble 89f\n" + "ld1w { z11.s }, p2/Z, [x12]\n" + "fmla z24.s, z11.s, z0.s[3]\n" + "addvl x12, x12, #1\n" + "fmla z25.s, z11.s, z1.s[3]\n" + "fmla z26.s, z11.s, z2.s[3]\n" + "fmla z27.s, z11.s, z3.s[3]\n" + "fmla z28.s, z11.s, z4.s[3]\n" + "fmla z29.s, z11.s, z5.s[3]\n" + "fmla z30.s, z11.s, z6.s[3]\n" + "89:" // Height 7: Multiply loop: multiply skip + "prfm pldl1keep, [x27, #0x80]\n" + "add x9, x9, #0x1\n" "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x16, x19\n" - "bne 91b\n" - "tbz %x[flags], #1, 97f\n" + "cmp x9, x19\n" + "bne 84b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x26, x10, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "tbz %x[flags], #1, 90f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1rw { z17.s }, p2/Z, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -1371,86 +1306,62 @@ void sve_hybrid_fp32_mla_8x1VL ( "fmin z30.s, p2/M, z30.s, z16.s\n" "fmax z29.s, p2/M, z29.s, z17.s\n" "fmax z30.s, p2/M, z30.s, z17.s\n" - "97:" // Height 7: No activation - "st1w { z24.s }, p1, [x17]\n" - "addvl x17, x17, #1\n" - "st1w { z25.s }, p1, [x13]\n" - "addvl x13, x13, #1\n" - "st1w { z26.s }, p1, [x11]\n" - "addvl x11, x11, #1\n" - "st1w { z27.s }, p1, [x9]\n" - "addvl x9, x9, #1\n" - "st1w { z28.s }, p1, [x27]\n" - "addvl x27, x27, #1\n" - "st1w { z29.s }, p1, [x25]\n" - "addvl x25, x25, #1\n" - "st1w { z30.s }, p1, [x23]\n" - "addvl x23, x23, #1\n" - 
"98:" // Height 7: Writeback done - "decw x6\n" - "cmp x6, XZR\n" - "bgt 87b\n" - "b 114f\n" - "99:" // Height 8 - "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" - "mov x8, %x[bias]\n" - "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 100f\n" - "ldr x17, [%x[output_ptr], #0x0]\n" - "add x17, x17, x19, LSL #2\n" - "ldr x13, [%x[output_ptr], #0x8]\n" - "ldr x11, [%x[output_ptr], #0x10]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x18]\n" - "ldr x27, [%x[output_ptr], #0x20]\n" - "add x11, x11, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x28]\n" - "add x9, x9, x19, LSL #2\n" - "ldr x23, [%x[output_ptr], #0x30]\n" - "ldr x21, [%x[output_ptr], #0x38]\n" - "add x27, x27, x19, LSL #2\n" - "add x25, x25, x19, LSL #2\n" - "add %x[output_ptr], %x[output_ptr], #0x40\n" - "add x23, x23, x19, LSL #2\n" - "add x21, x21, x19, LSL #2\n" - "b 101f\n" - "100:" // Height 8: setup direct output - "mov x17, %x[output_ptr]\n" - "add x13, x17, x19, LSL #2\n" - "add x11, x13, x19, LSL #2\n" - "add x9, x11, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "add x23, x25, x19, LSL #2\n" - "add x21, x23, x19, LSL #2\n" - "add %x[output_ptr], x21, x19, LSL #2\n" - "101:" // Height 8: Column loop + "90:" // Height 7: No activation + "st1w { z24.s }, p1, [x10]\n" + "addvl x10, x10, #1\n" + "st1w { z25.s }, p1, [x26]\n" + "st1w { z26.s }, p1, [x25]\n" + "st1w { z27.s }, p1, [x24]\n" + "st1w { z28.s }, p1, [x23]\n" + "st1w { z29.s }, p1, [x22]\n" + "st1w { z30.s }, p1, [x21]\n" + "91:" // Height 7: Writeback done + "decw x13\n" + "cmp x13, XZR\n" + "bgt 80b\n" + "b 106f\n" + "92:" // Height 8 + "ldr x13, [%x[args_ptr], %[offsetof_N]]\n" + "mov x11, %x[bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x10, %x[output_ptr]\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x19, #0x20\n" + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "93:" // 
Height 8: Column loop "mov x19, #0x0\n" - "whilelt p1.s, x19, x6\n" - "cbz x8, 102f\n" - "ld1w { z24.s }, p2/Z, [x8]\n" + "whilelt p1.s, x19, x13\n" + "cbz x11, 94f\n" + "ld1w { z24.s }, p2/Z, [x11]\n" "mov z25.d, z24.d\n" - "addvl x8, x8, #1\n" + "addvl x11, x11, #1\n" "mov z26.d, z24.d\n" "mov z27.d, z24.d\n" "mov z28.d, z24.d\n" "mov z29.d, z24.d\n" "mov z30.d, z24.d\n" "mov z31.d, z24.d\n" - "b 104f\n" - "102:" // Height 8: no bias - "tbz %x[flags], #0, 103f\n" - "ld1w { z24.s }, p1/Z, [x17]\n" - "ld1w { z25.s }, p1/Z, [x13]\n" - "ld1w { z26.s }, p1/Z, [x11]\n" - "ld1w { z27.s }, p1/Z, [x9]\n" - "ld1w { z28.s }, p1/Z, [x27]\n" - "ld1w { z29.s }, p1/Z, [x25]\n" - "ld1w { z30.s }, p1/Z, [x23]\n" - "ld1w { z31.s }, p1/Z, [x21]\n" - "b 104f\n" - "103:" // Height 8: no accumulate + "b 96f\n" + "94:" // Height 8: no bias + "tbz %x[flags], #0, 95f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z24.s }, p1/Z, [x10]\n" + "add x26, x10, x19, LSL #2\n" + "ld1w { z25.s }, p1/Z, [x26]\n" + "add x25, x26, x19, LSL #2\n" + "ld1w { z26.s }, p1/Z, [x25]\n" + "add x24, x25, x19, LSL #2\n" + "ld1w { z27.s }, p1/Z, [x24]\n" + "add x23, x24, x19, LSL #2\n" + "ld1w { z28.s }, p1/Z, [x23]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z29.s }, p1/Z, [x22]\n" + "add x21, x22, x19, LSL #2\n" + "ld1w { z30.s }, p1/Z, [x21]\n" + "add x20, x21, x19, LSL #2\n" + "ld1w { z31.s }, p1/Z, [x20]\n" + "b 96f\n" + "95:" // Height 8: no accumulate "mov z24.b, #0x0\n" "mov z25.b, #0x0\n" "mov z26.b, #0x0\n" @@ -1459,94 +1370,94 @@ void sve_hybrid_fp32_mla_8x1VL ( "mov z29.b, #0x0\n" "mov z30.b, #0x0\n" "mov z31.b, #0x0\n" - "104:" // Height 8: setup done - "mov x16, #0x0\n" - "105:" // Height 8: String loop + "96:" // Height 8: setup done + "mov x9, #0x0\n" + "97:" // Height 8: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w15, [x20, x16, LSL #0x2]\n" - "tbz %x[flags], #3, 106f\n" - "ldr x20, 
[%x[input_ptr], x16, LSL #0x3]\n" + "ldr w28, [x20, x9, LSL #0x2]\n" + "tbz %x[flags], #3, 98f\n" + "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x14, [x20, #0x0]\n" - "ldr x12, [x20, #0x8]\n" - "ldr x10, [x20, #0x10]\n" - "ldr x28, [x20, #0x18]\n" - "ldr x26, [x20, #0x20]\n" - "ldr x24, [x20, #0x28]\n" - "ldr x22, [x20, #0x30]\n" + "ldr x27, [x20, #0x0]\n" + "ldr x26, [x20, #0x8]\n" + "ldr x25, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x23, [x20, #0x20]\n" + "ldr x22, [x20, #0x28]\n" + "ldr x21, [x20, #0x30]\n" "ldr x20, [x20, #0x38]\n" - "cbnz x16, 107f\n" + "cbnz x9, 99f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x14, x14, x19, LSL #2\n" - "add x12, x12, x19, LSL #2\n" - "add x10, x10, x19, LSL #2\n" - "add x28, x28, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" "add x26, x26, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" "add x24, x24, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" "add x22, x22, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" "add x20, x20, x19, LSL #2\n" - "b 107f\n" - "106:" // Height 8: setup direct input - "mov x14, %x[input_ptr]\n" - "add x12, x14, x19, LSL #2\n" - "add x10, x12, x19, LSL #2\n" - "add x28, x10, x19, LSL #2\n" - "add x26, x28, x19, LSL #2\n" - "add x24, x26, x19, LSL #2\n" - "add x22, x24, x19, LSL #2\n" - "add x20, x22, x19, LSL #2\n" - "107:" // Height 8: input setup done - "cmp x15, #0x4\n" - "ble 109f\n" - "108:" // Height 8: Multiply loop: Main loop head - "ld1w { z8.s }, p2/Z, [x7]\n" - "whilelt p0.s, XZR, x15\n" - "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n" - "sub x15, x15, #0x4\n" - "ld1rqw { z0.s }, p0/Z, [x14]\n" + "b 99f\n" + "98:" // Height 8: setup direct input + "mov x27, %x[input_ptr]\n" + "add x26, x27, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "99:" // Height 8: input setup 
done + "cmp x28, #0x4\n" + "ble 101f\n" + "100:" // Height 8: Multiply loop: Main loop head + "ld1w { z8.s }, p2/Z, [x12]\n" + "whilelt p0.s, XZR, x28\n" + "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n" + "sub x28, x28, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x27]\n" "fmla z24.s, z8.s, z0.s[0]\n" - "ld1rqw { z1.s }, p0/Z, [x12]\n" - "add x14, x14, #0x10\n" + "ld1rqw { z1.s }, p0/Z, [x26]\n" + "add x27, x27, #0x10\n" "fmla z25.s, z8.s, z1.s[0]\n" - "ld1rqw { z2.s }, p0/Z, [x10]\n" - "add x12, x12, #0x10\n" + "ld1rqw { z2.s }, p0/Z, [x25]\n" + "add x26, x26, #0x10\n" "fmla z24.s, z9.s, z0.s[1]\n" - "ld1rqw { z3.s }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqw { z3.s }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "fmla z26.s, z8.s, z2.s[0]\n" - "ld1rqw { z4.s }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqw { z4.s }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "fmla z27.s, z8.s, z3.s[0]\n" - "ld1rqw { z5.s }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqw { z5.s }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "fmla z25.s, z9.s, z1.s[1]\n" - "ld1rqw { z6.s }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" + "ld1rqw { z6.s }, p0/Z, [x21]\n" + "add x22, x22, #0x10\n" "fmla z28.s, z8.s, z4.s[0]\n" "ld1rqw { z7.s }, p0/Z, [x20]\n" - "add x22, x22, #0x10\n" + "add x21, x21, #0x10\n" "fmla z29.s, z8.s, z5.s[0]\n" - "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" "add x20, x20, #0x10\n" "fmla z30.s, z8.s, z6.s[0]\n" - "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n" - "cmp x15, #0x4\n" + "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" + "cmp x28, #0x4\n" "fmla z31.s, z8.s, z7.s[0]\n" - "prfm pldl1keep, [x14, #0x80]\n" - "addvl x7, x7, #4\n" + "prfm pldl1keep, [x27, #0x80]\n" + "addvl x12, x12, #4\n" "fmla z26.s, z9.s, z2.s[1]\n" - "prfm pldl1keep, [x12, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" "fmla z27.s, z9.s, z3.s[1]\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" "fmla z28.s, z9.s, z4.s[1]\n" - "prfm pldl1keep, [x28, 
#0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" "fmla z29.s, z9.s, z5.s[1]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "fmla z30.s, z9.s, z6.s[1]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "fmla z31.s, z9.s, z7.s[1]\n" "prfm pldl1keep, [x22, #0x80]\n" + "fmla z31.s, z9.s, z7.s[1]\n" + "prfm pldl1keep, [x21, #0x80]\n" "fmla z24.s, z10.s, z0.s[2]\n" "prfm pldl1keep, [x20, #0x80]\n" "fmla z25.s, z10.s, z1.s[2]\n" @@ -1564,85 +1475,93 @@ void sve_hybrid_fp32_mla_8x1VL ( "fmla z29.s, z11.s, z5.s[3]\n" "fmla z30.s, z11.s, z6.s[3]\n" "fmla z31.s, z11.s, z7.s[3]\n" - "bgt 108b\n" - "109:" // Height 8: Multiply loop: Single iteration only - "ld1w { z12.s }, p2/Z, [x7]\n" - "whilelt p0.s, XZR, x15\n" - "subs x15, x15, #0x1\n" - "ld1rqw { z0.s }, p0/Z, [x14]\n" - "fmla z24.s, z12.s, z0.s[0]\n" - "ld1rqw { z1.s }, p0/Z, [x12]\n" - "add x14, x14, #0x10\n" - "fmla z25.s, z12.s, z1.s[0]\n" - "ld1rqw { z2.s }, p0/Z, [x10]\n" - "add x12, x12, #0x10\n" - "fmla z26.s, z12.s, z2.s[0]\n" - "ld1rqw { z3.s }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" - "fmla z27.s, z12.s, z3.s[0]\n" - "ld1rqw { z4.s }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" - "fmla z28.s, z12.s, z4.s[0]\n" - "ld1rqw { z5.s }, p0/Z, [x24]\n" + "bgt 100b\n" + "101:" // Height 8: Multiply loop: Single iteration only + "ld1w { z8.s }, p2/Z, [x12]\n" + "whilelt p0.s, XZR, x28\n" + "subs x28, x28, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x27]\n" + "fmla z24.s, z8.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x26]\n" + "add x27, x27, #0x10\n" + "fmla z25.s, z8.s, z1.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x25]\n" "add x26, x26, #0x10\n" - "fmla z29.s, z12.s, z5.s[0]\n" - "ld1rqw { z6.s }, p0/Z, [x22]\n" + "fmla z26.s, z8.s, z2.s[0]\n" + "ld1rqw { z3.s }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" + "fmla z27.s, z8.s, z3.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x23]\n" "add x24, x24, #0x10\n" - "fmla z30.s, z12.s, z6.s[0]\n" - "ld1rqw { z7.s }, p0/Z, [x20]\n" + "fmla z28.s, z8.s, z4.s[0]\n" + "ld1rqw { z5.s }, p0/Z, [x22]\n" + 
"add x23, x23, #0x10\n" + "fmla z29.s, z8.s, z5.s[0]\n" + "ld1rqw { z6.s }, p0/Z, [x21]\n" "add x22, x22, #0x10\n" - "fmla z31.s, z12.s, z7.s[0]\n" + "fmla z30.s, z8.s, z6.s[0]\n" + "ld1rqw { z7.s }, p0/Z, [x20]\n" + "add x21, x21, #0x10\n" + "fmla z31.s, z8.s, z7.s[0]\n" "add x20, x20, #0x10\n" - "addvl x7, x7, #1\n" - "ble 110f\n" - "ld1w { z13.s }, p2/Z, [x7]\n" - "fmla z24.s, z13.s, z0.s[1]\n" - "subs x15, x15, #0x1\n" - "fmla z25.s, z13.s, z1.s[1]\n" - "addvl x7, x7, #1\n" - "fmla z26.s, z13.s, z2.s[1]\n" - "fmla z27.s, z13.s, z3.s[1]\n" - "fmla z28.s, z13.s, z4.s[1]\n" - "fmla z29.s, z13.s, z5.s[1]\n" - "fmla z30.s, z13.s, z6.s[1]\n" - "fmla z31.s, z13.s, z7.s[1]\n" - "ble 110f\n" - "ld1w { z14.s }, p2/Z, [x7]\n" - "fmla z24.s, z14.s, z0.s[2]\n" - "subs x15, x15, #0x1\n" - "fmla z25.s, z14.s, z1.s[2]\n" - "addvl x7, x7, #1\n" - "fmla z26.s, z14.s, z2.s[2]\n" - "fmla z27.s, z14.s, z3.s[2]\n" - "fmla z28.s, z14.s, z4.s[2]\n" - "fmla z29.s, z14.s, z5.s[2]\n" - "fmla z30.s, z14.s, z6.s[2]\n" - "fmla z31.s, z14.s, z7.s[2]\n" - "ble 110f\n" - "ld1w { z15.s }, p2/Z, [x7]\n" - "fmla z24.s, z15.s, z0.s[3]\n" - "addvl x7, x7, #1\n" - "fmla z25.s, z15.s, z1.s[3]\n" - "fmla z26.s, z15.s, z2.s[3]\n" - "fmla z27.s, z15.s, z3.s[3]\n" - "fmla z28.s, z15.s, z4.s[3]\n" - "fmla z29.s, z15.s, z5.s[3]\n" - "fmla z30.s, z15.s, z6.s[3]\n" - "fmla z31.s, z15.s, z7.s[3]\n" - "110:" // Height 8: Multiply loop: multiply skip - "prfm pldl1keep, [x14, #0x80]\n" - "add x16, x16, #0x1\n" - "prfm pldl1keep, [x12, #0x80]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "prfm pldl1keep, [x28, #0x80]\n" + "addvl x12, x12, #1\n" + "ble 102f\n" + "ld1w { z9.s }, p2/Z, [x12]\n" + "fmla z24.s, z9.s, z0.s[1]\n" + "subs x28, x28, #0x1\n" + "fmla z25.s, z9.s, z1.s[1]\n" + "addvl x12, x12, #1\n" + "fmla z26.s, z9.s, z2.s[1]\n" + "fmla z27.s, z9.s, z3.s[1]\n" + "fmla z28.s, z9.s, z4.s[1]\n" + "fmla z29.s, z9.s, z5.s[1]\n" + "fmla z30.s, z9.s, z6.s[1]\n" + "fmla z31.s, z9.s, z7.s[1]\n" + "ble 102f\n" + "ld1w { 
z10.s }, p2/Z, [x12]\n" + "fmla z24.s, z10.s, z0.s[2]\n" + "subs x28, x28, #0x1\n" + "fmla z25.s, z10.s, z1.s[2]\n" + "addvl x12, x12, #1\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z27.s, z10.s, z3.s[2]\n" + "fmla z28.s, z10.s, z4.s[2]\n" + "fmla z29.s, z10.s, z5.s[2]\n" + "fmla z30.s, z10.s, z6.s[2]\n" + "fmla z31.s, z10.s, z7.s[2]\n" + "ble 102f\n" + "ld1w { z11.s }, p2/Z, [x12]\n" + "fmla z24.s, z11.s, z0.s[3]\n" + "addvl x12, x12, #1\n" + "fmla z25.s, z11.s, z1.s[3]\n" + "fmla z26.s, z11.s, z2.s[3]\n" + "fmla z27.s, z11.s, z3.s[3]\n" + "fmla z28.s, z11.s, z4.s[3]\n" + "fmla z29.s, z11.s, z5.s[3]\n" + "fmla z30.s, z11.s, z6.s[3]\n" + "fmla z31.s, z11.s, z7.s[3]\n" + "102:" // Height 8: Multiply loop: multiply skip + "prfm pldl1keep, [x27, #0x80]\n" + "add x9, x9, #0x1\n" "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" "prfm pldl1keep, [x20, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x16, x19\n" - "bne 105b\n" - "tbz %x[flags], #1, 111f\n" + "cmp x9, x19\n" + "bne 97b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x26, x10, x19, LSL #2\n" + "add x25, x26, x19, LSL #2\n" + "add x24, x25, x19, LSL #2\n" + "add x23, x24, x19, LSL #2\n" + "add x22, x23, x19, LSL #2\n" + "add x21, x22, x19, LSL #2\n" + "add x20, x21, x19, LSL #2\n" + "tbz %x[flags], #1, 103f\n" "add x19, %x[args_ptr], %[offset_min]\n" "ld1rw { z17.s }, p2/Z, [x19]\n" "add x19, %x[args_ptr], %[offset_max]\n" @@ -1663,43 +1582,36 @@ void sve_hybrid_fp32_mla_8x1VL ( "fmax z29.s, p2/M, z29.s, z17.s\n" "fmax z30.s, p2/M, z30.s, z17.s\n" "fmax z31.s, p2/M, z31.s, z17.s\n" - "111:" // Height 8: No activation - "st1w { z24.s }, p1, [x17]\n" - "addvl x17, x17, #1\n" - "st1w { z25.s }, p1, [x13]\n" - "addvl x13, x13, #1\n" - "st1w { z26.s }, p1, [x11]\n" - "addvl x11, x11, #1\n" - "st1w { z27.s }, p1, [x9]\n" - 
"addvl x9, x9, #1\n" - "st1w { z28.s }, p1, [x27]\n" - "addvl x27, x27, #1\n" - "st1w { z29.s }, p1, [x25]\n" - "addvl x25, x25, #1\n" - "st1w { z30.s }, p1, [x23]\n" - "addvl x23, x23, #1\n" - "st1w { z31.s }, p1, [x21]\n" - "addvl x21, x21, #1\n" - "112:" // Height 8: Writeback done - "decw x6\n" - "cmp x6, XZR\n" - "bgt 101b\n" + "103:" // Height 8: No activation + "st1w { z24.s }, p1, [x10]\n" + "addvl x10, x10, #1\n" + "st1w { z25.s }, p1, [x26]\n" + "st1w { z26.s }, p1, [x25]\n" + "st1w { z27.s }, p1, [x24]\n" + "st1w { z28.s }, p1, [x23]\n" + "st1w { z29.s }, p1, [x22]\n" + "st1w { z30.s }, p1, [x21]\n" + "st1w { z31.s }, p1, [x20]\n" + "104:" // Height 8: Writeback done + "decw x13\n" + "cmp x13, XZR\n" + "bgt 93b\n" "subs %x[M], %x[M], #0x8\n" - "beq 114f\n" + "beq 106f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 113f\n" + "tbz %x[flags], #3, 105f\n" "add x20, x20, #0x8\n" "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "113:" // Update direct input + "105:" // Update direct input "mov x19, #0x20\n" "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" "b 1b\n" - "114:" // Exit + "106:" // Exit : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) - : "cc", "memory", "p0", "p1", "p2", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", 
"x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z16", "z17", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp index b4d3f0283d..bc93ced25b 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp @@ -36,7 +36,6 @@ namespace arm_gemm { - // Actual kernel implementations void sve_hybrid_s8qa_dot_4x4VL( ARGLIST ); @@ -73,7 +72,6 @@ public: // Default to the generic kernel kern_type kernel=sve_hybrid_s8qa_dot_4x4VL; - cls_sve_hybrid_s8qa_dot_4x4VL(const CPUInfo *) { } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp index 3c778bfe94..50b9ba524d 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp @@ -81,152 +81,143 @@ void sve_hybrid_s8qa_dot_4x4VL ( "ptrue p2.b\n" "1:" // Row loop "cmp %x[M], #0x4\n" - "bge 46f\n" + "bge 43f\n" "cmp %x[M], #0x2\n" - "bgt 31f\n" - "beq 16f\n" + "bgt 29f\n" + "beq 15f\n" "mov z11.s, #0x0\n" - "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" - "mov z12.s, #0x0\n" - "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "mov x10, %x[col_bias]\n" - "mov z13.s, #0x0\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "bic %x[flags], %x[flags], #0x80000000\n" - "mov z14.s, 
#0x0\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" "mov z15.b, #0x1\n" - "tbz %x[flags], #2, 2f\n" - "ldr x9, [%x[output_ptr], #0x0]\n" - "add x9, x9, x19\n" - "b 3f\n" - "2:" // Height 1: setup direct output - "mov x9, %x[output_ptr]\n" - "3:" // Height 1: Column loop + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x27, %x[col_bias]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov x26, %x[output_ptr]\n" + "2:" // Height 1: Column loop "mov z16.s, #0x0\n" "mov x19, #0x0\n" "mov z17.s, #0x0\n" - "whilelt p1.b, x19, x12\n" + "whilelt p1.b, x19, x9\n" "mov z18.s, #0x0\n" "mov z19.s, #0x0\n" - "4:" // Height 1: setup done - "mov x28, #0x0\n" - "5:" // Height 1: String loop + "3:" // Height 1: setup done + "mov x25, #0x0\n" + "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 6f\n" - "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 5f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x26, [x20, #0x0]\n" - "cbnz x28, 7f\n" + "ldr x23, [x20, #0x0]\n" + "cbnz x25, 6f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x19\n" - "b 7f\n" - "6:" // Height 1: setup direct input - "mov x26, %x[input_ptr]\n" - "7:" // Height 1: input setup done - "cmp x27, #0x10\n" - "ble 10f\n" - "8:" // Height 1: Multiply loop: Main loop head - "ld1b { z4.b }, p2/Z, [x11]\n" - "whilelt p0.b, XZR, x27\n" - "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" + "add x23, x23, x19\n" + "b 6f\n" + "5:" // Height 1: setup direct input + "mov x23, %x[input_ptr]\n" + "6:" // Height 1: input setup done + "cmp x24, #0x10\n" + "ble 9f\n" + "7:" // Height 1: Multiply loop: Main loop head + "ld1b { z4.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1rqb { 
z0.b }, p0/Z, [x23]\n" "sdot z16.s, z4.b, z0.b[0]\n" - "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" - "add x26, x26, #0x10\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" + "add x23, x23, #0x10\n" "sdot z17.s, z5.b, z0.b[0]\n" - "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n" "sdot z18.s, z6.b, z0.b[0]\n" - "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n" "sdot z19.s, z7.b, z0.b[0]\n" "sdot z16.s, z8.b, z0.b[1]\n" - "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n" - "addvl x11, x11, #16\n" + "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #16\n" "sdot z17.s, z9.b, z0.b[1]\n" - "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n" "sdot z18.s, z10.b, z0.b[1]\n" - "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n" "sdot z19.s, z4.b, z0.b[1]\n" - "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" "sdot z16.s, z5.b, z0.b[2]\n" - "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n" "sdot z17.s, z6.b, z0.b[2]\n" - "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n" "sdot z18.s, z7.b, z0.b[2]\n" - "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n" "sdot z19.s, z8.b, z0.b[2]\n" - "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n" "sdot z16.s, z9.b, z0.b[3]\n" "sdot z17.s, z10.b, z0.b[3]\n" "sdot z18.s, z4.b, z0.b[3]\n" "sdot z19.s, z5.b, z0.b[3]\n" - "tbnz %x[flags], #31, 9f\n" + "tbnz %x[flags], #31, 8f\n" "sdot z11.s, z0.b, z15.b\n" - "9:" // Height 1: 
Multiply loop: unique 1: skip row sum - "prfm pldl1keep, [x26, #0x80]\n" - "sub x27, x27, #0x10\n" - "cmp x27, #0x10\n" - "bgt 8b\n" - "10:" // Height 1: Multiply loop: Single iteration only - "ld1b { z6.b }, p2/Z, [x11]\n" - "whilelt p0.b, XZR, x27\n" - "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n" - "subs x27, x27, #0x4\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" - "sdot z16.s, z6.b, z0.b[0]\n" - "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n" - "add x26, x26, #0x10\n" - "sdot z17.s, z7.b, z0.b[0]\n" - "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n" - "addvl x11, x11, #4\n" - "sdot z18.s, z8.b, z0.b[0]\n" - "sdot z19.s, z9.b, z0.b[0]\n" - "ble 11f\n" - "ld1b { z10.b }, p2/Z, [x11]\n" - "sdot z16.s, z10.b, z0.b[1]\n" - "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n" - "subs x27, x27, #0x4\n" - "sdot z17.s, z4.b, z0.b[1]\n" - "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n" - "sdot z18.s, z5.b, z0.b[1]\n" - "addvl x11, x11, #4\n" - "sdot z19.s, z6.b, z0.b[1]\n" - "ble 11f\n" - "ld1b { z7.b }, p2/Z, [x11]\n" - "sdot z16.s, z7.b, z0.b[2]\n" - "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n" - "subs x27, x27, #0x4\n" - "sdot z17.s, z8.b, z0.b[2]\n" - "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n" - "sdot z18.s, z9.b, z0.b[2]\n" - "addvl x11, x11, #4\n" - "sdot z19.s, z10.b, z0.b[2]\n" - "ble 11f\n" - "ld1b { z4.b }, p2/Z, [x11]\n" - "sdot z16.s, z4.b, z0.b[3]\n" - "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" - "sdot z17.s, z5.b, z0.b[3]\n" - "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" - "addvl x11, x11, #4\n" - "sdot z18.s, z6.b, z0.b[3]\n" - "sdot z19.s, z7.b, z0.b[3]\n" - "11:" // Height 1: Multiply loop: multiply skip - "tbnz %x[flags], #31, 12f\n" + "8:" // Height 1: Multiply loop: unique 1: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "sub x24, x24, #0x10\n" + "cmp x24, #0x10\n" + "bgt 7b\n" + "9:" // Height 1: Multiply loop: Single iteration only + "ld1b { z4.b }, 
p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x24, x24, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x23]\n" + "sdot z16.s, z4.b, z0.b[0]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" + "add x23, x23, #0x10\n" + "sdot z17.s, z5.b, z0.b[0]\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "sdot z18.s, z6.b, z0.b[0]\n" + "sdot z19.s, z7.b, z0.b[0]\n" + "ble 10f\n" + "ld1b { z8.b }, p2/Z, [x28]\n" + "sdot z16.s, z8.b, z0.b[1]\n" + "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x24, x24, #0x4\n" + "sdot z17.s, z9.b, z0.b[1]\n" + "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" + "sdot z18.s, z10.b, z0.b[1]\n" + "addvl x28, x28, #4\n" + "sdot z19.s, z4.b, z0.b[1]\n" + "ble 10f\n" + "ld1b { z5.b }, p2/Z, [x28]\n" + "sdot z16.s, z5.b, z0.b[2]\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x24, x24, #0x4\n" + "sdot z17.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + "sdot z18.s, z7.b, z0.b[2]\n" + "addvl x28, x28, #4\n" + "sdot z19.s, z8.b, z0.b[2]\n" + "ble 10f\n" + "ld1b { z9.b }, p2/Z, [x28]\n" + "sdot z16.s, z9.b, z0.b[3]\n" + "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" + "sdot z17.s, z10.b, z0.b[3]\n" + "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "sdot z18.s, z4.b, z0.b[3]\n" + "sdot z19.s, z5.b, z0.b[3]\n" + "10:" // Height 1: Multiply loop: multiply skip + "tbnz %x[flags], #31, 11f\n" "sdot z11.s, z0.b, z15.b\n" - "12:" // Height 1: Multiply loop: unique 2: skip row sum - "prfm pldl1keep, [x26, #0x80]\n" - "add x28, x28, #0x1\n" + "11:" // Height 1: Multiply loop: unique 2: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "add x25, x25, #0x1\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x28, x19\n" - "bne 5b\n" - "tbnz %x[flags], #31, 13f\n" + "cmp x25, x19\n" + "bne 4b\n" + "tbnz %x[flags], #31, 
12f\n" "add x19, %x[qp], %[b_offset]\n" "ld1rw { z1.s }, p2/Z, [x19]\n" "neg z1.s, p2/M, z1.s\n" @@ -235,21 +226,21 @@ void sve_hybrid_s8qa_dot_4x4VL ( "saddv d11, p0, z11.s\n" "mov z11.s, z11.s[0]\n" "mul z11.s, p2/M, z11.s, z1.s\n" - "13:" // Height 1: skip row sum fixup + "12:" // Height 1: skip row sum fixup "add z16.s, z16.s, z11.s\n" - "ld1w { z0.s }, p2/Z, [x10]\n" + "ld1w { z0.s }, p2/Z, [x27]\n" "orr %x[flags], %x[flags], #0x80000000\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add z18.s, z18.s, z11.s\n" - "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n" "add x19, %x[qp], %[per_layer_mul]\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" - "addvl x10, x10, #4\n" + "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" "add z16.s, z16.s, z0.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" "add z17.s, z17.s, z1.s\n" "ld1rw { z4.s }, p2/Z, [x19]\n" "add z18.s, z18.s, z2.s\n" @@ -258,7 +249,7 @@ void sve_hybrid_s8qa_dot_4x4VL ( ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" - "tbz %x[flags], #5, 14f\n" + "tbz %x[flags], #5, 13f\n" "and z4.d, z16.d, z0.d\n" "asr z4.s, z4.s, #0x1f\n" "and z5.d, z17.d, z0.d\n" @@ -271,7 +262,7 @@ void sve_hybrid_s8qa_dot_4x4VL ( "sqadd z17.s, z17.s, z5.s\n" "sqadd z18.s, z18.s, z6.s\n" "sqadd z19.s, z19.s, z7.s\n" - "14:" // Height 1: no shift correction + "13:" // Height 1: no shift correction ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" "add x19, %x[qp], %[c_offset]\n" "ld1rw { z4.s }, p2/Z, [x19]\n" @@ -297,107 +288,96 @@ void sve_hybrid_s8qa_dot_4x4VL ( "uzp1 z16.h, z16.h, z17.h\n" "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z16.b, z16.b, z17.b\n" 
- "st1b { z16.b }, p1, [x9]\n" - "addvl x9, x9, #1\n" - "15:" // Height 1: Writeback done - "decw x12, ALL, MUL #4\n" - "cmp x12, XZR\n" - "bgt 3b\n" - "b 62f\n" - "16:" // Height 2 + "st1b { z16.b }, p1, [x26]\n" + "addvl x26, x26, #1\n" + "14:" // Height 1: Writeback done + "decw x9, ALL, MUL #4\n" + "cmp x9, XZR\n" + "bgt 2b\n" + "b 58f\n" + "15:" // Height 2 "mov z11.s, #0x0\n" - "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" - "mov x10, %x[col_bias]\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[col_bias]\n" "mov z12.s, #0x0\n" - "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" "bic %x[flags], %x[flags], #0x80000000\n" - "mov z13.s, #0x0\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "mov z14.s, #0x0\n" "mov z15.b, #0x1\n" - "tbz %x[flags], #2, 17f\n" - "ldr x9, [%x[output_ptr], #0x0]\n" - "ldr x25, [%x[output_ptr], #0x8]\n" - "add x9, x9, x19\n" - "add x25, x25, x19\n" - "b 18f\n" - "17:" // Height 2: setup direct output - "mov x9, %x[output_ptr]\n" - "add x25, x9, x19\n" - "18:" // Height 2: Column loop + "mov x26, %x[output_ptr]\n" + "16:" // Height 2: Column loop "mov z16.s, #0x0\n" "mov x19, #0x0\n" "mov z17.s, #0x0\n" - "whilelt p1.b, x19, x12\n" + "whilelt p1.b, x19, x9\n" "mov z18.s, #0x0\n" "mov z19.s, #0x0\n" "mov z20.s, #0x0\n" "mov z21.s, #0x0\n" "mov z22.s, #0x0\n" "mov z23.s, #0x0\n" - "19:" // Height 2: setup done - "mov x28, #0x0\n" - "20:" // Height 2: String loop + "17:" // Height 2: setup done + "mov x25, #0x0\n" + "18:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 21f\n" - "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 19f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x26, [x20, #0x0]\n" - "ldr x24, [x20, #0x8]\n" - "cbnz x28, 
22f\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "cbnz x25, 20f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x19\n" - "add x24, x24, x19\n" - "b 22f\n" - "21:" // Height 2: setup direct input - "mov x26, %x[input_ptr]\n" - "add x24, x26, x19\n" - "22:" // Height 2: input setup done - "cmp x27, #0x10\n" - "ble 25f\n" - "23:" // Height 2: Multiply loop: Main loop head - "ld1b { z4.b }, p2/Z, [x11]\n" - "whilelt p0.b, XZR, x27\n" - "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "b 20f\n" + "19:" // Height 2: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19\n" + "20:" // Height 2: input setup done + "cmp x24, #0x10\n" + "ble 23f\n" + "21:" // Height 2: Multiply loop: Main loop head + "ld1b { z4.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1rqb { z0.b }, p0/Z, [x23]\n" "sdot z16.s, z4.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "sdot z17.s, z5.b, z0.b[0]\n" - "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" - "add x24, x24, #0x10\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" + "add x22, x22, #0x10\n" "sdot z20.s, z4.b, z1.b[0]\n" - "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" "sdot z21.s, z5.b, z1.b[0]\n" - "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n" "sdot z18.s, z6.b, z0.b[0]\n" - "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n" "sdot z22.s, z6.b, z1.b[0]\n" - "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n" "sdot z19.s, z7.b, z0.b[0]\n" - "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n" - "addvl x11, x11, #16\n" + "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #16\n" "sdot z23.s, z7.b, z1.b[0]\n" - "ld1b { 
z5.b }, p2/Z, [x11, #-8, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n" "sdot z16.s, z8.b, z0.b[1]\n" - "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n" "sdot z20.s, z8.b, z1.b[1]\n" - "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n" "sdot z17.s, z9.b, z0.b[1]\n" - "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" "sdot z21.s, z9.b, z1.b[1]\n" - "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n" "sdot z18.s, z10.b, z0.b[1]\n" "sdot z22.s, z10.b, z1.b[1]\n" - "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n" "sdot z19.s, z4.b, z0.b[1]\n" "sdot z23.s, z4.b, z1.b[1]\n" - "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n" "sdot z16.s, z5.b, z0.b[2]\n" "sdot z20.s, z5.b, z1.b[2]\n" - "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n" "sdot z17.s, z6.b, z0.b[2]\n" "sdot z21.s, z6.b, z1.b[2]\n" "sdot z18.s, z7.b, z0.b[2]\n" @@ -412,91 +392,93 @@ void sve_hybrid_s8qa_dot_4x4VL ( "sdot z22.s, z4.b, z1.b[3]\n" "sdot z19.s, z5.b, z0.b[3]\n" "sdot z23.s, z5.b, z1.b[3]\n" - "tbnz %x[flags], #31, 24f\n" + "tbnz %x[flags], #31, 22f\n" "sdot z11.s, z0.b, z15.b\n" "sdot z12.s, z1.b, z15.b\n" - "24:" // Height 2: Multiply loop: unique 3: skip row sum - "prfm pldl1keep, [x26, #0x80]\n" - "sub x27, x27, #0x10\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x27, #0x10\n" - "bgt 23b\n" - "25:" // Height 2: Multiply loop: Single iteration only - "ld1b { z6.b }, p2/Z, [x11]\n" - "whilelt p0.b, XZR, x27\n" - "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n" - "subs x27, x27, #0x4\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" - "sdot z16.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" - "sdot z17.s, z7.b, z0.b[0]\n" - "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n" - "add x24, x24, #0x10\n" - 
"sdot z20.s, z6.b, z1.b[0]\n" - "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n" - "addvl x11, x11, #4\n" - "sdot z21.s, z7.b, z1.b[0]\n" - "sdot z18.s, z8.b, z0.b[0]\n" - "sdot z22.s, z8.b, z1.b[0]\n" - "sdot z19.s, z9.b, z0.b[0]\n" - "sdot z23.s, z9.b, z1.b[0]\n" - "ble 26f\n" - "ld1b { z10.b }, p2/Z, [x11]\n" - "sdot z16.s, z10.b, z0.b[1]\n" - "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n" - "subs x27, x27, #0x4\n" - "sdot z20.s, z10.b, z1.b[1]\n" - "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n" - "sdot z17.s, z4.b, z0.b[1]\n" - "addvl x11, x11, #4\n" - "sdot z21.s, z4.b, z1.b[1]\n" - "sdot z18.s, z5.b, z0.b[1]\n" - "sdot z22.s, z5.b, z1.b[1]\n" - "sdot z19.s, z6.b, z0.b[1]\n" - "sdot z23.s, z6.b, z1.b[1]\n" - "ble 26f\n" - "ld1b { z7.b }, p2/Z, [x11]\n" - "sdot z16.s, z7.b, z0.b[2]\n" - "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n" - "subs x27, x27, #0x4\n" - "sdot z20.s, z7.b, z1.b[2]\n" - "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n" - "sdot z17.s, z8.b, z0.b[2]\n" - "addvl x11, x11, #4\n" - "sdot z21.s, z8.b, z1.b[2]\n" - "sdot z18.s, z9.b, z0.b[2]\n" - "sdot z22.s, z9.b, z1.b[2]\n" - "sdot z19.s, z10.b, z0.b[2]\n" - "sdot z23.s, z10.b, z1.b[2]\n" - "ble 26f\n" - "ld1b { z4.b }, p2/Z, [x11]\n" - "sdot z16.s, z4.b, z0.b[3]\n" - "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" - "sdot z20.s, z4.b, z1.b[3]\n" - "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" - "sdot z17.s, z5.b, z0.b[3]\n" - "addvl x11, x11, #4\n" - "sdot z21.s, z5.b, z1.b[3]\n" - "sdot z18.s, z6.b, z0.b[3]\n" - "sdot z22.s, z6.b, z1.b[3]\n" - "sdot z19.s, z7.b, z0.b[3]\n" - "sdot z23.s, z7.b, z1.b[3]\n" - "26:" // Height 2: Multiply loop: multiply skip - "tbnz %x[flags], #31, 27f\n" + "22:" // Height 2: Multiply loop: unique 3: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "sub x24, x24, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "cmp x24, #0x10\n" + "bgt 21b\n" + "23:" // Height 2: 
Multiply loop: Single iteration only + "ld1b { z4.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x24, x24, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x23]\n" + "sdot z16.s, z4.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" + "sdot z17.s, z5.b, z0.b[0]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" + "add x22, x22, #0x10\n" + "sdot z20.s, z4.b, z1.b[0]\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "sdot z21.s, z5.b, z1.b[0]\n" + "sdot z18.s, z6.b, z0.b[0]\n" + "sdot z22.s, z6.b, z1.b[0]\n" + "sdot z19.s, z7.b, z0.b[0]\n" + "sdot z23.s, z7.b, z1.b[0]\n" + "ble 24f\n" + "ld1b { z8.b }, p2/Z, [x28]\n" + "sdot z16.s, z8.b, z0.b[1]\n" + "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x24, x24, #0x4\n" + "sdot z20.s, z8.b, z1.b[1]\n" + "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" + "sdot z17.s, z9.b, z0.b[1]\n" + "addvl x28, x28, #4\n" + "sdot z21.s, z9.b, z1.b[1]\n" + "sdot z18.s, z10.b, z0.b[1]\n" + "sdot z22.s, z10.b, z1.b[1]\n" + "sdot z19.s, z4.b, z0.b[1]\n" + "sdot z23.s, z4.b, z1.b[1]\n" + "ble 24f\n" + "ld1b { z5.b }, p2/Z, [x28]\n" + "sdot z16.s, z5.b, z0.b[2]\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x24, x24, #0x4\n" + "sdot z20.s, z5.b, z1.b[2]\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + "sdot z17.s, z6.b, z0.b[2]\n" + "addvl x28, x28, #4\n" + "sdot z21.s, z6.b, z1.b[2]\n" + "sdot z18.s, z7.b, z0.b[2]\n" + "sdot z22.s, z7.b, z1.b[2]\n" + "sdot z19.s, z8.b, z0.b[2]\n" + "sdot z23.s, z8.b, z1.b[2]\n" + "ble 24f\n" + "ld1b { z9.b }, p2/Z, [x28]\n" + "sdot z16.s, z9.b, z0.b[3]\n" + "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" + "sdot z20.s, z9.b, z1.b[3]\n" + "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n" + "sdot z17.s, z10.b, z0.b[3]\n" + "addvl x28, x28, #4\n" + "sdot z21.s, z10.b, z1.b[3]\n" + "sdot z18.s, 
z4.b, z0.b[3]\n" + "sdot z22.s, z4.b, z1.b[3]\n" + "sdot z19.s, z5.b, z0.b[3]\n" + "sdot z23.s, z5.b, z1.b[3]\n" + "24:" // Height 2: Multiply loop: multiply skip + "tbnz %x[flags], #31, 25f\n" "sdot z11.s, z0.b, z15.b\n" "sdot z12.s, z1.b, z15.b\n" - "27:" // Height 2: Multiply loop: unique 4: skip row sum - "prfm pldl1keep, [x26, #0x80]\n" - "add x28, x28, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" + "25:" // Height 2: Multiply loop: unique 4: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "add x25, x25, #0x1\n" + "prfm pldl1keep, [x22, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x28, x19\n" - "bne 20b\n" - "tbnz %x[flags], #31, 28f\n" + "cmp x25, x19\n" + "bne 18b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x22, x26, x19\n" + "tbnz %x[flags], #31, 26f\n" "add x19, %x[qp], %[b_offset]\n" "ld1rw { z2.s }, p2/Z, [x19]\n" "neg z2.s, p2/M, z2.s\n" @@ -508,19 +490,19 @@ void sve_hybrid_s8qa_dot_4x4VL ( "mov z12.s, z12.s[0]\n" "mul z11.s, p2/M, z11.s, z2.s\n" "mul z12.s, p2/M, z12.s, z2.s\n" - "28:" // Height 2: skip row sum fixup + "26:" // Height 2: skip row sum fixup "add z16.s, z16.s, z11.s\n" - "ld1w { z0.s }, p2/Z, [x10]\n" + "ld1w { z0.s }, p2/Z, [x27]\n" "orr %x[flags], %x[flags], #0x80000000\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add z18.s, z18.s, z11.s\n" - "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n" "add x19, %x[qp], %[per_layer_mul]\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" - "addvl x10, x10, #4\n" + "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" "add z20.s, z20.s, z12.s\n" "ld1rw { z4.s }, p2/Z, [x19]\n" "add z21.s, z21.s, z12.s\n" @@ -531,7 +513,7 @@ void sve_hybrid_s8qa_dot_4x4VL ( "add z18.s, z18.s, z2.s\n" "add z19.s, z19.s, 
z3.s\n" "add z20.s, z20.s, z0.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" "add z21.s, z21.s, z1.s\n" "add z22.s, z22.s, z2.s\n" "add z23.s, z23.s, z3.s\n" @@ -543,7 +525,7 @@ void sve_hybrid_s8qa_dot_4x4VL ( ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" - "tbz %x[flags], #5, 29f\n" + "tbz %x[flags], #5, 27f\n" "and z4.d, z16.d, z0.d\n" "asr z4.s, z4.s, #0x1f\n" "and z5.d, z17.d, z0.d\n" @@ -568,7 +550,7 @@ void sve_hybrid_s8qa_dot_4x4VL ( "sqadd z21.s, z21.s, z9.s\n" "sqadd z22.s, z22.s, z10.s\n" "sqadd z23.s, z23.s, z4.s\n" - "29:" // Height 2: no shift correction + "27:" // Height 2: no shift correction ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" "add x19, %x[qp], %[c_offset]\n" "ld1rw { z4.s }, p2/Z, [x19]\n" @@ -599,9 +581,9 @@ void sve_hybrid_s8qa_dot_4x4VL ( "uzp1 z17.h, z18.h, z19.h\n" "smax z20.s, p2/M, z20.s, z5.s\n" "uzp1 z16.b, z16.b, z17.b\n" - "st1b { z16.b }, p1, [x9]\n" + "st1b { z16.b }, p1, [x26]\n" "add z21.s, z21.s, z4.s\n" - "addvl x9, x9, #1\n" + "addvl x26, x26, #1\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" "smin z21.s, p2/M, z21.s, z6.s\n" @@ -615,41 +597,27 @@ void sve_hybrid_s8qa_dot_4x4VL ( "smax z23.s, p2/M, z23.s, z5.s\n" "uzp1 z21.h, z22.h, z23.h\n" "uzp1 z20.b, z20.b, z21.b\n" - "st1b { z20.b }, p1, [x25]\n" - "addvl x25, x25, #1\n" - "30:" // Height 2: Writeback done - "decw x12, ALL, MUL #4\n" - "cmp x12, XZR\n" - "bgt 18b\n" - "b 62f\n" - "31:" // Height 3 + "st1b { z20.b }, p1, [x22]\n" + "28:" // Height 2: Writeback done + "decw x9, ALL, MUL #4\n" + "cmp x9, XZR\n" + "bgt 16b\n" + "b 58f\n" + "29:" // Height 3 "mov z11.s, #0x0\n" - "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" - "mov x10, %x[col_bias]\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[col_bias]\n" "mov z12.s, #0x0\n" - "ldr x11, 
[%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" "bic %x[flags], %x[flags], #0x80000000\n" "mov z13.s, #0x0\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "mov z14.s, #0x0\n" + "mov x26, %x[output_ptr]\n" "mov z15.b, #0x1\n" - "tbz %x[flags], #2, 32f\n" - "ldr x9, [%x[output_ptr], #0x0]\n" - "ldr x25, [%x[output_ptr], #0x8]\n" - "add x9, x9, x19\n" - "ldr x23, [%x[output_ptr], #0x10]\n" - "add x25, x25, x19\n" - "add x23, x23, x19\n" - "b 33f\n" - "32:" // Height 3: setup direct output - "mov x9, %x[output_ptr]\n" - "add x25, x9, x19\n" - "add x23, x25, x19\n" - "33:" // Height 3: Column loop + "30:" // Height 3: Column loop "mov z16.s, #0x0\n" "mov x19, #0x0\n" "mov z17.s, #0x0\n" - "whilelt p1.b, x19, x12\n" + "whilelt p1.b, x19, x9\n" "mov z18.s, #0x0\n" "mov z19.s, #0x0\n" "mov z20.s, #0x0\n" @@ -660,83 +628,83 @@ void sve_hybrid_s8qa_dot_4x4VL ( "mov z25.s, #0x0\n" "mov z26.s, #0x0\n" "mov z27.s, #0x0\n" - "34:" // Height 3: setup done - "mov x28, #0x0\n" - "35:" // Height 3: String loop + "31:" // Height 3: setup done + "mov x25, #0x0\n" + "32:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 36f\n" - "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 33f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x26, [x20, #0x0]\n" - "ldr x24, [x20, #0x8]\n" - "ldr x22, [x20, #0x10]\n" - "cbnz x28, 37f\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "ldr x21, [x20, #0x10]\n" + "cbnz x25, 34f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x19\n" - "add x24, x24, x19\n" + "add x23, x23, x19\n" "add x22, x22, x19\n" - "b 37f\n" - "36:" // Height 3: setup direct input - "mov x26, %x[input_ptr]\n" - "add x24, x26, x19\n" - "add x22, x24, x19\n" - 
"37:" // Height 3: input setup done - "cmp x27, #0x10\n" - "ble 40f\n" - "38:" // Height 3: Multiply loop: Main loop head - "ld1b { z4.b }, p2/Z, [x11]\n" - "whilelt p0.b, XZR, x27\n" - "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" + "add x21, x21, x19\n" + "b 34f\n" + "33:" // Height 3: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "34:" // Height 3: input setup done + "cmp x24, #0x10\n" + "ble 37f\n" + "35:" // Height 3: Multiply loop: Main loop head + "ld1b { z4.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1rqb { z0.b }, p0/Z, [x23]\n" "sdot z16.s, z4.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "sdot z17.s, z5.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" - "sdot z20.s, z4.b, z1.b[0]\n" - "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "ld1rqb { z2.b }, p0/Z, [x21]\n" "add x22, x22, #0x10\n" + "sdot z20.s, z4.b, z1.b[0]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" + "add x21, x21, #0x10\n" "sdot z24.s, z4.b, z2.b[0]\n" - "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" "sdot z21.s, z5.b, z1.b[0]\n" - "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n" "sdot z25.s, z5.b, z2.b[0]\n" - "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n" "sdot z18.s, z6.b, z0.b[0]\n" - "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n" "sdot z22.s, z6.b, z1.b[0]\n" - "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n" - "addvl x11, x11, #16\n" + "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #16\n" "sdot z26.s, z6.b, z2.b[0]\n" - "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n" "sdot z19.s, z7.b, z0.b[0]\n" - "ld1b { z6.b }, p2/Z, 
[x11, #-7, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n" "sdot z23.s, z7.b, z1.b[0]\n" "sdot z27.s, z7.b, z2.b[0]\n" - "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n" "sdot z16.s, z8.b, z0.b[1]\n" "sdot z20.s, z8.b, z1.b[1]\n" "sdot z24.s, z8.b, z2.b[1]\n" - "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" "sdot z17.s, z9.b, z0.b[1]\n" "sdot z21.s, z9.b, z1.b[1]\n" "sdot z25.s, z9.b, z2.b[1]\n" - "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n" "sdot z18.s, z10.b, z0.b[1]\n" "sdot z22.s, z10.b, z1.b[1]\n" "sdot z26.s, z10.b, z2.b[1]\n" - "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n" "sdot z19.s, z4.b, z0.b[1]\n" "sdot z23.s, z4.b, z1.b[1]\n" "sdot z27.s, z4.b, z2.b[1]\n" - "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n" "sdot z16.s, z5.b, z0.b[2]\n" "sdot z20.s, z5.b, z1.b[2]\n" "sdot z24.s, z5.b, z2.b[2]\n" - "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n" "sdot z17.s, z6.b, z0.b[2]\n" "sdot z21.s, z6.b, z1.b[2]\n" "sdot z25.s, z6.b, z2.b[2]\n" @@ -758,113 +726,116 @@ void sve_hybrid_s8qa_dot_4x4VL ( "sdot z19.s, z5.b, z0.b[3]\n" "sdot z23.s, z5.b, z1.b[3]\n" "sdot z27.s, z5.b, z2.b[3]\n" - "tbnz %x[flags], #31, 39f\n" + "tbnz %x[flags], #31, 36f\n" "sdot z11.s, z0.b, z15.b\n" "sdot z12.s, z1.b, z15.b\n" "sdot z13.s, z2.b, z15.b\n" - "39:" // Height 3: Multiply loop: unique 5: skip row sum - "prfm pldl1keep, [x26, #0x80]\n" - "sub x27, x27, #0x10\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x27, #0x10\n" + "36:" // Height 3: Multiply loop: unique 5: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "sub x24, x24, #0x10\n" "prfm pldl1keep, [x22, #0x80]\n" - "bgt 38b\n" - "40:" // Height 3: Multiply loop: Single iteration only - "ld1b { z6.b }, p2/Z, [x11]\n" - "whilelt p0.b, XZR, x27\n" - "ld1b { z7.b }, 
p2/Z, [x11, #1, MUL VL]\n" - "subs x27, x27, #0x4\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" - "sdot z16.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" - "sdot z17.s, z7.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" - "sdot z20.s, z6.b, z1.b[0]\n" - "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n" + "cmp x24, #0x10\n" + "prfm pldl1keep, [x21, #0x80]\n" + "bgt 35b\n" + "37:" // Height 3: Multiply loop: Single iteration only + "ld1b { z4.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x24, x24, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x23]\n" + "sdot z16.s, z4.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" + "sdot z17.s, z5.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x21]\n" "add x22, x22, #0x10\n" - "sdot z24.s, z6.b, z2.b[0]\n" - "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n" - "addvl x11, x11, #4\n" - "sdot z21.s, z7.b, z1.b[0]\n" - "sdot z25.s, z7.b, z2.b[0]\n" - "sdot z18.s, z8.b, z0.b[0]\n" - "sdot z22.s, z8.b, z1.b[0]\n" - "sdot z26.s, z8.b, z2.b[0]\n" - "sdot z19.s, z9.b, z0.b[0]\n" - "sdot z23.s, z9.b, z1.b[0]\n" - "sdot z27.s, z9.b, z2.b[0]\n" - "ble 41f\n" - "ld1b { z10.b }, p2/Z, [x11]\n" - "sdot z16.s, z10.b, z0.b[1]\n" - "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n" - "subs x27, x27, #0x4\n" - "sdot z20.s, z10.b, z1.b[1]\n" - "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n" - "sdot z24.s, z10.b, z2.b[1]\n" - "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n" - "addvl x11, x11, #4\n" - "sdot z17.s, z4.b, z0.b[1]\n" - "sdot z21.s, z4.b, z1.b[1]\n" - "sdot z25.s, z4.b, z2.b[1]\n" - "sdot z18.s, z5.b, z0.b[1]\n" - "sdot z22.s, z5.b, z1.b[1]\n" - "sdot z26.s, z5.b, z2.b[1]\n" - "sdot z19.s, z6.b, z0.b[1]\n" - "sdot z23.s, z6.b, z1.b[1]\n" - "sdot z27.s, z6.b, z2.b[1]\n" - "ble 41f\n" - "ld1b { z7.b }, p2/Z, [x11]\n" - "sdot z16.s, z7.b, z0.b[2]\n" - "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n" - "subs x27, x27, #0x4\n" - "sdot z20.s, z7.b, z1.b[2]\n" - "ld1b 
{ z9.b }, p2/Z, [x11, #2, MUL VL]\n" - "sdot z24.s, z7.b, z2.b[2]\n" - "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n" - "addvl x11, x11, #4\n" - "sdot z17.s, z8.b, z0.b[2]\n" - "sdot z21.s, z8.b, z1.b[2]\n" - "sdot z25.s, z8.b, z2.b[2]\n" - "sdot z18.s, z9.b, z0.b[2]\n" - "sdot z22.s, z9.b, z1.b[2]\n" - "sdot z26.s, z9.b, z2.b[2]\n" - "sdot z19.s, z10.b, z0.b[2]\n" - "sdot z23.s, z10.b, z1.b[2]\n" - "sdot z27.s, z10.b, z2.b[2]\n" - "ble 41f\n" - "ld1b { z4.b }, p2/Z, [x11]\n" - "sdot z16.s, z4.b, z0.b[3]\n" - "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" - "sdot z20.s, z4.b, z1.b[3]\n" - "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" - "sdot z24.s, z4.b, z2.b[3]\n" - "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" - "addvl x11, x11, #4\n" - "sdot z17.s, z5.b, z0.b[3]\n" - "sdot z21.s, z5.b, z1.b[3]\n" - "sdot z25.s, z5.b, z2.b[3]\n" - "sdot z18.s, z6.b, z0.b[3]\n" - "sdot z22.s, z6.b, z1.b[3]\n" - "sdot z26.s, z6.b, z2.b[3]\n" - "sdot z19.s, z7.b, z0.b[3]\n" - "sdot z23.s, z7.b, z1.b[3]\n" - "sdot z27.s, z7.b, z2.b[3]\n" - "41:" // Height 3: Multiply loop: multiply skip - "tbnz %x[flags], #31, 42f\n" + "sdot z20.s, z4.b, z1.b[0]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" + "add x21, x21, #0x10\n" + "sdot z24.s, z4.b, z2.b[0]\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "sdot z21.s, z5.b, z1.b[0]\n" + "sdot z25.s, z5.b, z2.b[0]\n" + "sdot z18.s, z6.b, z0.b[0]\n" + "sdot z22.s, z6.b, z1.b[0]\n" + "sdot z26.s, z6.b, z2.b[0]\n" + "sdot z19.s, z7.b, z0.b[0]\n" + "sdot z23.s, z7.b, z1.b[0]\n" + "sdot z27.s, z7.b, z2.b[0]\n" + "ble 38f\n" + "ld1b { z8.b }, p2/Z, [x28]\n" + "sdot z16.s, z8.b, z0.b[1]\n" + "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x24, x24, #0x4\n" + "sdot z20.s, z8.b, z1.b[1]\n" + "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n" + "sdot z24.s, z8.b, z2.b[1]\n" + "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "sdot z17.s, z9.b, z0.b[1]\n" + "sdot z21.s, z9.b, z1.b[1]\n" + "sdot z25.s, z9.b, z2.b[1]\n" + "sdot 
z18.s, z10.b, z0.b[1]\n" + "sdot z22.s, z10.b, z1.b[1]\n" + "sdot z26.s, z10.b, z2.b[1]\n" + "sdot z19.s, z4.b, z0.b[1]\n" + "sdot z23.s, z4.b, z1.b[1]\n" + "sdot z27.s, z4.b, z2.b[1]\n" + "ble 38f\n" + "ld1b { z5.b }, p2/Z, [x28]\n" + "sdot z16.s, z5.b, z0.b[2]\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x24, x24, #0x4\n" + "sdot z20.s, z5.b, z1.b[2]\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + "sdot z24.s, z5.b, z2.b[2]\n" + "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "sdot z17.s, z6.b, z0.b[2]\n" + "sdot z21.s, z6.b, z1.b[2]\n" + "sdot z25.s, z6.b, z2.b[2]\n" + "sdot z18.s, z7.b, z0.b[2]\n" + "sdot z22.s, z7.b, z1.b[2]\n" + "sdot z26.s, z7.b, z2.b[2]\n" + "sdot z19.s, z8.b, z0.b[2]\n" + "sdot z23.s, z8.b, z1.b[2]\n" + "sdot z27.s, z8.b, z2.b[2]\n" + "ble 38f\n" + "ld1b { z9.b }, p2/Z, [x28]\n" + "sdot z16.s, z9.b, z0.b[3]\n" + "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" + "sdot z20.s, z9.b, z1.b[3]\n" + "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" + "sdot z24.s, z9.b, z2.b[3]\n" + "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "sdot z17.s, z10.b, z0.b[3]\n" + "sdot z21.s, z10.b, z1.b[3]\n" + "sdot z25.s, z10.b, z2.b[3]\n" + "sdot z18.s, z4.b, z0.b[3]\n" + "sdot z22.s, z4.b, z1.b[3]\n" + "sdot z26.s, z4.b, z2.b[3]\n" + "sdot z19.s, z5.b, z0.b[3]\n" + "sdot z23.s, z5.b, z1.b[3]\n" + "sdot z27.s, z5.b, z2.b[3]\n" + "38:" // Height 3: Multiply loop: multiply skip + "tbnz %x[flags], #31, 39f\n" "sdot z11.s, z0.b, z15.b\n" "sdot z12.s, z1.b, z15.b\n" "sdot z13.s, z2.b, z15.b\n" - "42:" // Height 3: Multiply loop: unique 6: skip row sum - "prfm pldl1keep, [x26, #0x80]\n" - "add x28, x28, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" + "39:" // Height 3: Multiply loop: unique 6: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "add x25, x25, #0x1\n" "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x28, x19\n" - "bne 35b\n" - "tbnz 
%x[flags], #31, 43f\n" + "cmp x25, x19\n" + "bne 32b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x22, x26, x19\n" + "add x21, x22, x19\n" + "tbnz %x[flags], #31, 40f\n" "add x19, %x[qp], %[b_offset]\n" "ld1rw { z3.s }, p2/Z, [x19]\n" "neg z3.s, p2/M, z3.s\n" @@ -879,19 +850,19 @@ void sve_hybrid_s8qa_dot_4x4VL ( "mul z11.s, p2/M, z11.s, z3.s\n" "mul z12.s, p2/M, z12.s, z3.s\n" "mul z13.s, p2/M, z13.s, z3.s\n" - "43:" // Height 3: skip row sum fixup + "40:" // Height 3: skip row sum fixup "add z16.s, z16.s, z11.s\n" - "ld1w { z0.s }, p2/Z, [x10]\n" + "ld1w { z0.s }, p2/Z, [x27]\n" "orr %x[flags], %x[flags], #0x80000000\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add z18.s, z18.s, z11.s\n" - "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n" "add x19, %x[qp], %[per_layer_mul]\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" - "addvl x10, x10, #4\n" + "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" "add z20.s, z20.s, z12.s\n" "ld1rw { z4.s }, p2/Z, [x19]\n" "add z21.s, z21.s, z12.s\n" @@ -910,7 +881,7 @@ void sve_hybrid_s8qa_dot_4x4VL ( "add z22.s, z22.s, z2.s\n" "add z23.s, z23.s, z3.s\n" "add z24.s, z24.s, z0.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" "add z25.s, z25.s, z1.s\n" "add z26.s, z26.s, z2.s\n" "add z27.s, z27.s, z3.s\n" @@ -926,7 +897,7 @@ void sve_hybrid_s8qa_dot_4x4VL ( ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n" ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n" ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n" - "tbz %x[flags], #5, 44f\n" + "tbz %x[flags], #5, 41f\n" "and z4.d, z16.d, z0.d\n" "asr z4.s, z4.s, #0x1f\n" "and z5.d, z17.d, z0.d\n" @@ -963,7 +934,7 @@ void sve_hybrid_s8qa_dot_4x4VL ( "asr z8.s, z8.s, #0x1f\n" "sqadd z26.s, 
z26.s, z7.s\n" "sqadd z27.s, z27.s, z8.s\n" - "44:" // Height 3: no shift correction + "41:" // Height 3: no shift correction ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" "add x19, %x[qp], %[c_offset]\n" "ld1rw { z4.s }, p2/Z, [x19]\n" @@ -994,9 +965,9 @@ void sve_hybrid_s8qa_dot_4x4VL ( "uzp1 z17.h, z18.h, z19.h\n" "smax z20.s, p2/M, z20.s, z5.s\n" "uzp1 z16.b, z16.b, z17.b\n" - "st1b { z16.b }, p1, [x9]\n" + "st1b { z16.b }, p1, [x26]\n" "add z21.s, z21.s, z4.s\n" - "addvl x9, x9, #1\n" + "addvl x26, x26, #1\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" @@ -1019,58 +990,42 @@ void sve_hybrid_s8qa_dot_4x4VL ( "uzp1 z21.h, z22.h, z23.h\n" ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" "uzp1 z20.b, z20.b, z21.b\n" - "st1b { z20.b }, p1, [x25]\n" + "st1b { z20.b }, p1, [x22]\n" "add z26.s, z26.s, z4.s\n" - "addvl x25, x25, #1\n" - "add z27.s, z27.s, z4.s\n" "smax z25.s, p2/M, z25.s, z5.s\n" + "add z27.s, z27.s, z4.s\n" "smin z26.s, p2/M, z26.s, z6.s\n" - "smin z27.s, p2/M, z27.s, z6.s\n" "uzp1 z24.h, z24.h, z25.h\n" + "smin z27.s, p2/M, z27.s, z6.s\n" "smax z26.s, p2/M, z26.s, z5.s\n" "smax z27.s, p2/M, z27.s, z5.s\n" "uzp1 z25.h, z26.h, z27.h\n" "uzp1 z24.b, z24.b, z25.b\n" - "st1b { z24.b }, p1, [x23]\n" - "addvl x23, x23, #1\n" - "45:" // Height 3: Writeback done - "decw x12, ALL, MUL #4\n" - "cmp x12, XZR\n" - "bgt 33b\n" - "b 62f\n" - "46:" // Height 4 + "st1b { z24.b }, p1, [x21]\n" + "42:" // Height 3: Writeback done + "decw x9, ALL, MUL #4\n" + "cmp x9, XZR\n" + "bgt 30b\n" + "b 58f\n" + "43:" // Height 4 "mov z11.s, #0x0\n" - "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" - "mov x10, %x[col_bias]\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[col_bias]\n" "mov z12.s, #0x0\n" - "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" "bic %x[flags], %x[flags], 
#0x80000000\n" "mov z13.s, #0x0\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x26, %x[output_ptr]\n" "mov z14.s, #0x0\n" + "mov x19, #0x4\n" "mov z15.b, #0x1\n" - "tbz %x[flags], #2, 47f\n" - "ldr x9, [%x[output_ptr], #0x0]\n" - "ldr x25, [%x[output_ptr], #0x8]\n" - "add x9, x9, x19\n" - "ldr x23, [%x[output_ptr], #0x10]\n" - "ldr x21, [%x[output_ptr], #0x18]\n" - "add x25, x25, x19\n" - "add %x[output_ptr], %x[output_ptr], #0x20\n" - "add x23, x23, x19\n" - "add x21, x21, x19\n" - "b 48f\n" - "47:" // Height 4: setup direct output - "mov x9, %x[output_ptr]\n" - "add x25, x9, x19\n" - "add x23, x25, x19\n" - "add x21, x23, x19\n" - "add %x[output_ptr], x21, x19\n" - "48:" // Height 4: Column loop + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "44:" // Height 4: Column loop "mov z16.s, #0x0\n" "mov x19, #0x0\n" "mov z17.s, #0x0\n" - "whilelt p1.b, x19, x12\n" + "whilelt p1.b, x19, x9\n" "mov z18.s, #0x0\n" "mov z19.s, #0x0\n" "mov z20.s, #0x0\n" @@ -1085,97 +1040,97 @@ void sve_hybrid_s8qa_dot_4x4VL ( "mov z29.s, #0x0\n" "mov z30.s, #0x0\n" "mov z31.s, #0x0\n" - "49:" // Height 4: setup done - "mov x28, #0x0\n" - "50:" // Height 4: String loop + "45:" // Height 4: setup done + "mov x25, #0x0\n" + "46:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 51f\n" - "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 47f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x26, [x20, #0x0]\n" - "ldr x24, [x20, #0x8]\n" - "ldr x22, [x20, #0x10]\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "ldr x21, [x20, #0x10]\n" "ldr x20, [x20, #0x18]\n" - "cbnz x28, 52f\n" + "cbnz x25, 48f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, 
x19\n" - "add x24, x24, x19\n" + "add x23, x23, x19\n" "add x22, x22, x19\n" + "add x21, x21, x19\n" "add x20, x20, x19\n" - "b 52f\n" - "51:" // Height 4: setup direct input - "mov x26, %x[input_ptr]\n" - "add x24, x26, x19\n" - "add x22, x24, x19\n" - "add x20, x22, x19\n" - "52:" // Height 4: input setup done - "cmp x27, #0x10\n" - "ble 55f\n" - "53:" // Height 4: Multiply loop: Main loop head - "ld1b { z4.b }, p2/Z, [x11]\n" - "whilelt p0.b, XZR, x27\n" - "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" + "b 48f\n" + "47:" // Height 4: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "add x20, x21, x19\n" + "48:" // Height 4: input setup done + "cmp x24, #0x10\n" + "ble 51f\n" + "49:" // Height 4: Multiply loop: Main loop head + "ld1b { z4.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1rqb { z0.b }, p0/Z, [x23]\n" "sdot z16.s, z4.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "sdot z17.s, z5.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x21]\n" + "add x22, x22, #0x10\n" "sdot z20.s, z4.b, z1.b[0]\n" "ld1rqb { z3.b }, p0/Z, [x20]\n" - "add x22, x22, #0x10\n" + "add x21, x21, #0x10\n" "sdot z24.s, z4.b, z2.b[0]\n" - "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" "add x20, x20, #0x10\n" "sdot z21.s, z5.b, z1.b[0]\n" - "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" "sdot z25.s, z5.b, z2.b[0]\n" - "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n" "sdot z28.s, z4.b, z3.b[0]\n" - "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n" "sdot z29.s, z5.b, z3.b[0]\n" - "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x28, #6, MUL 
VL]\n" "sdot z18.s, z6.b, z0.b[0]\n" - "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n" - "addvl x11, x11, #16\n" + "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #16\n" "sdot z22.s, z6.b, z1.b[0]\n" - "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n" "sdot z26.s, z6.b, z2.b[0]\n" "sdot z30.s, z6.b, z3.b[0]\n" - "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n" "sdot z19.s, z7.b, z0.b[0]\n" "sdot z23.s, z7.b, z1.b[0]\n" "sdot z27.s, z7.b, z2.b[0]\n" "sdot z31.s, z7.b, z3.b[0]\n" - "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n" "sdot z16.s, z8.b, z0.b[1]\n" "sdot z20.s, z8.b, z1.b[1]\n" "sdot z24.s, z8.b, z2.b[1]\n" "sdot z28.s, z8.b, z3.b[1]\n" - "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" "sdot z17.s, z9.b, z0.b[1]\n" "sdot z21.s, z9.b, z1.b[1]\n" "sdot z25.s, z9.b, z2.b[1]\n" "sdot z29.s, z9.b, z3.b[1]\n" - "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n" "sdot z18.s, z10.b, z0.b[1]\n" "sdot z22.s, z10.b, z1.b[1]\n" "sdot z26.s, z10.b, z2.b[1]\n" "sdot z30.s, z10.b, z3.b[1]\n" - "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n" "sdot z19.s, z4.b, z0.b[1]\n" "sdot z23.s, z4.b, z1.b[1]\n" "sdot z27.s, z4.b, z2.b[1]\n" "sdot z31.s, z4.b, z3.b[1]\n" - "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n" "sdot z16.s, z5.b, z0.b[2]\n" "sdot z20.s, z5.b, z1.b[2]\n" "sdot z24.s, z5.b, z2.b[2]\n" "sdot z28.s, z5.b, z3.b[2]\n" - "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n" "sdot z17.s, z6.b, z0.b[2]\n" "sdot z21.s, z6.b, z1.b[2]\n" "sdot z25.s, z6.b, z2.b[2]\n" @@ -1204,135 +1159,139 @@ void sve_hybrid_s8qa_dot_4x4VL ( "sdot z23.s, z5.b, z1.b[3]\n" "sdot z27.s, z5.b, z2.b[3]\n" "sdot z31.s, z5.b, z3.b[3]\n" - "tbnz %x[flags], #31, 
54f\n" + "tbnz %x[flags], #31, 50f\n" "sdot z11.s, z0.b, z15.b\n" "sdot z12.s, z1.b, z15.b\n" "sdot z13.s, z2.b, z15.b\n" "sdot z14.s, z3.b, z15.b\n" - "54:" // Height 4: Multiply loop: unique 7: skip row sum - "prfm pldl1keep, [x26, #0x80]\n" - "sub x27, x27, #0x10\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x27, #0x10\n" + "50:" // Height 4: Multiply loop: unique 7: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "sub x24, x24, #0x10\n" "prfm pldl1keep, [x22, #0x80]\n" + "cmp x24, #0x10\n" + "prfm pldl1keep, [x21, #0x80]\n" "prfm pldl1keep, [x20, #0x80]\n" - "bgt 53b\n" - "55:" // Height 4: Multiply loop: Single iteration only - "ld1b { z6.b }, p2/Z, [x11]\n" - "whilelt p0.b, XZR, x27\n" - "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n" - "subs x27, x27, #0x4\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" - "sdot z16.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" - "sdot z17.s, z7.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" - "sdot z20.s, z6.b, z1.b[0]\n" - "ld1rqb { z3.b }, p0/Z, [x20]\n" + "bgt 49b\n" + "51:" // Height 4: Multiply loop: Single iteration only + "ld1b { z4.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x24, x24, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x23]\n" + "sdot z16.s, z4.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" + "sdot z17.s, z5.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x21]\n" "add x22, x22, #0x10\n" - "sdot z24.s, z6.b, z2.b[0]\n" - "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n" + "sdot z20.s, z4.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x20]\n" + "add x21, x21, #0x10\n" + "sdot z24.s, z4.b, z2.b[0]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" "add x20, x20, #0x10\n" - "sdot z21.s, z7.b, z1.b[0]\n" - "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n" - "addvl x11, x11, #4\n" - "sdot z28.s, z6.b, z3.b[0]\n" - "sdot z25.s, z7.b, z2.b[0]\n" - "sdot z29.s, z7.b, z3.b[0]\n" - "sdot z18.s, z8.b, z0.b[0]\n" - "sdot 
z22.s, z8.b, z1.b[0]\n" - "sdot z26.s, z8.b, z2.b[0]\n" - "sdot z30.s, z8.b, z3.b[0]\n" - "sdot z19.s, z9.b, z0.b[0]\n" - "sdot z23.s, z9.b, z1.b[0]\n" - "sdot z27.s, z9.b, z2.b[0]\n" - "sdot z31.s, z9.b, z3.b[0]\n" - "ble 56f\n" - "ld1b { z10.b }, p2/Z, [x11]\n" - "sdot z16.s, z10.b, z0.b[1]\n" - "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n" - "subs x27, x27, #0x4\n" - "sdot z20.s, z10.b, z1.b[1]\n" - "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n" - "sdot z24.s, z10.b, z2.b[1]\n" - "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n" - "addvl x11, x11, #4\n" - "sdot z28.s, z10.b, z3.b[1]\n" - "sdot z17.s, z4.b, z0.b[1]\n" - "sdot z21.s, z4.b, z1.b[1]\n" - "sdot z25.s, z4.b, z2.b[1]\n" - "sdot z29.s, z4.b, z3.b[1]\n" - "sdot z18.s, z5.b, z0.b[1]\n" - "sdot z22.s, z5.b, z1.b[1]\n" - "sdot z26.s, z5.b, z2.b[1]\n" - "sdot z30.s, z5.b, z3.b[1]\n" - "sdot z19.s, z6.b, z0.b[1]\n" - "sdot z23.s, z6.b, z1.b[1]\n" - "sdot z27.s, z6.b, z2.b[1]\n" - "sdot z31.s, z6.b, z3.b[1]\n" - "ble 56f\n" - "ld1b { z7.b }, p2/Z, [x11]\n" - "sdot z16.s, z7.b, z0.b[2]\n" - "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n" - "subs x27, x27, #0x4\n" - "sdot z20.s, z7.b, z1.b[2]\n" - "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n" - "sdot z24.s, z7.b, z2.b[2]\n" - "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n" - "addvl x11, x11, #4\n" - "sdot z28.s, z7.b, z3.b[2]\n" - "sdot z17.s, z8.b, z0.b[2]\n" - "sdot z21.s, z8.b, z1.b[2]\n" - "sdot z25.s, z8.b, z2.b[2]\n" - "sdot z29.s, z8.b, z3.b[2]\n" - "sdot z18.s, z9.b, z0.b[2]\n" - "sdot z22.s, z9.b, z1.b[2]\n" - "sdot z26.s, z9.b, z2.b[2]\n" - "sdot z30.s, z9.b, z3.b[2]\n" - "sdot z19.s, z10.b, z0.b[2]\n" - "sdot z23.s, z10.b, z1.b[2]\n" - "sdot z27.s, z10.b, z2.b[2]\n" - "sdot z31.s, z10.b, z3.b[2]\n" - "ble 56f\n" - "ld1b { z4.b }, p2/Z, [x11]\n" - "sdot z16.s, z4.b, z0.b[3]\n" - "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" - "sdot z20.s, z4.b, z1.b[3]\n" - "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" - "sdot z24.s, z4.b, z2.b[3]\n" - "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" - 
"addvl x11, x11, #4\n" - "sdot z28.s, z4.b, z3.b[3]\n" - "sdot z17.s, z5.b, z0.b[3]\n" - "sdot z21.s, z5.b, z1.b[3]\n" - "sdot z25.s, z5.b, z2.b[3]\n" - "sdot z29.s, z5.b, z3.b[3]\n" - "sdot z18.s, z6.b, z0.b[3]\n" - "sdot z22.s, z6.b, z1.b[3]\n" - "sdot z26.s, z6.b, z2.b[3]\n" - "sdot z30.s, z6.b, z3.b[3]\n" - "sdot z19.s, z7.b, z0.b[3]\n" - "sdot z23.s, z7.b, z1.b[3]\n" - "sdot z27.s, z7.b, z2.b[3]\n" - "sdot z31.s, z7.b, z3.b[3]\n" - "56:" // Height 4: Multiply loop: multiply skip - "tbnz %x[flags], #31, 57f\n" + "sdot z21.s, z5.b, z1.b[0]\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "sdot z28.s, z4.b, z3.b[0]\n" + "sdot z25.s, z5.b, z2.b[0]\n" + "sdot z29.s, z5.b, z3.b[0]\n" + "sdot z18.s, z6.b, z0.b[0]\n" + "sdot z22.s, z6.b, z1.b[0]\n" + "sdot z26.s, z6.b, z2.b[0]\n" + "sdot z30.s, z6.b, z3.b[0]\n" + "sdot z19.s, z7.b, z0.b[0]\n" + "sdot z23.s, z7.b, z1.b[0]\n" + "sdot z27.s, z7.b, z2.b[0]\n" + "sdot z31.s, z7.b, z3.b[0]\n" + "ble 52f\n" + "ld1b { z8.b }, p2/Z, [x28]\n" + "sdot z16.s, z8.b, z0.b[1]\n" + "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x24, x24, #0x4\n" + "sdot z20.s, z8.b, z1.b[1]\n" + "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n" + "sdot z24.s, z8.b, z2.b[1]\n" + "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "sdot z28.s, z8.b, z3.b[1]\n" + "sdot z17.s, z9.b, z0.b[1]\n" + "sdot z21.s, z9.b, z1.b[1]\n" + "sdot z25.s, z9.b, z2.b[1]\n" + "sdot z29.s, z9.b, z3.b[1]\n" + "sdot z18.s, z10.b, z0.b[1]\n" + "sdot z22.s, z10.b, z1.b[1]\n" + "sdot z26.s, z10.b, z2.b[1]\n" + "sdot z30.s, z10.b, z3.b[1]\n" + "sdot z19.s, z4.b, z0.b[1]\n" + "sdot z23.s, z4.b, z1.b[1]\n" + "sdot z27.s, z4.b, z2.b[1]\n" + "sdot z31.s, z4.b, z3.b[1]\n" + "ble 52f\n" + "ld1b { z5.b }, p2/Z, [x28]\n" + "sdot z16.s, z5.b, z0.b[2]\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x24, x24, #0x4\n" + "sdot z20.s, z5.b, z1.b[2]\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + "sdot z24.s, z5.b, z2.b[2]\n" + "ld1b { z8.b 
}, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "sdot z28.s, z5.b, z3.b[2]\n" + "sdot z17.s, z6.b, z0.b[2]\n" + "sdot z21.s, z6.b, z1.b[2]\n" + "sdot z25.s, z6.b, z2.b[2]\n" + "sdot z29.s, z6.b, z3.b[2]\n" + "sdot z18.s, z7.b, z0.b[2]\n" + "sdot z22.s, z7.b, z1.b[2]\n" + "sdot z26.s, z7.b, z2.b[2]\n" + "sdot z30.s, z7.b, z3.b[2]\n" + "sdot z19.s, z8.b, z0.b[2]\n" + "sdot z23.s, z8.b, z1.b[2]\n" + "sdot z27.s, z8.b, z2.b[2]\n" + "sdot z31.s, z8.b, z3.b[2]\n" + "ble 52f\n" + "ld1b { z9.b }, p2/Z, [x28]\n" + "sdot z16.s, z9.b, z0.b[3]\n" + "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" + "sdot z20.s, z9.b, z1.b[3]\n" + "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" + "sdot z24.s, z9.b, z2.b[3]\n" + "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "sdot z28.s, z9.b, z3.b[3]\n" + "sdot z17.s, z10.b, z0.b[3]\n" + "sdot z21.s, z10.b, z1.b[3]\n" + "sdot z25.s, z10.b, z2.b[3]\n" + "sdot z29.s, z10.b, z3.b[3]\n" + "sdot z18.s, z4.b, z0.b[3]\n" + "sdot z22.s, z4.b, z1.b[3]\n" + "sdot z26.s, z4.b, z2.b[3]\n" + "sdot z30.s, z4.b, z3.b[3]\n" + "sdot z19.s, z5.b, z0.b[3]\n" + "sdot z23.s, z5.b, z1.b[3]\n" + "sdot z27.s, z5.b, z2.b[3]\n" + "sdot z31.s, z5.b, z3.b[3]\n" + "52:" // Height 4: Multiply loop: multiply skip + "tbnz %x[flags], #31, 53f\n" "sdot z11.s, z0.b, z15.b\n" "sdot z12.s, z1.b, z15.b\n" "sdot z13.s, z2.b, z15.b\n" "sdot z14.s, z3.b, z15.b\n" - "57:" // Height 4: Multiply loop: unique 8: skip row sum - "prfm pldl1keep, [x26, #0x80]\n" - "add x28, x28, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" + "53:" // Height 4: Multiply loop: unique 8: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "add x25, x25, #0x1\n" "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" "prfm pldl1keep, [x20, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x28, x19\n" - "bne 50b\n" - "tbnz %x[flags], #31, 58f\n" + "cmp x25, x19\n" + "bne 46b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x22, x26, x19\n" + "add 
x21, x22, x19\n" + "add x20, x21, x19\n" + "tbnz %x[flags], #31, 54f\n" "add x19, %x[qp], %[b_offset]\n" "ld1rw { z4.s }, p2/Z, [x19]\n" "neg z4.s, p2/M, z4.s\n" @@ -1350,19 +1309,19 @@ void sve_hybrid_s8qa_dot_4x4VL ( "mul z12.s, p2/M, z12.s, z4.s\n" "mul z13.s, p2/M, z13.s, z4.s\n" "mul z14.s, p2/M, z14.s, z4.s\n" - "58:" // Height 4: skip row sum fixup + "54:" // Height 4: skip row sum fixup "add z16.s, z16.s, z11.s\n" - "ld1w { z0.s }, p2/Z, [x10]\n" + "ld1w { z0.s }, p2/Z, [x27]\n" "orr %x[flags], %x[flags], #0x80000000\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add z18.s, z18.s, z11.s\n" - "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n" "add x19, %x[qp], %[per_layer_mul]\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" - "addvl x10, x10, #4\n" + "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" "add z20.s, z20.s, z12.s\n" "ld1rw { z4.s }, p2/Z, [x19]\n" "add z21.s, z21.s, z12.s\n" @@ -1389,7 +1348,7 @@ void sve_hybrid_s8qa_dot_4x4VL ( "add z26.s, z26.s, z2.s\n" "add z27.s, z27.s, z3.s\n" "add z28.s, z28.s, z0.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" "add z29.s, z29.s, z1.s\n" "add z30.s, z30.s, z2.s\n" "add z31.s, z31.s, z3.s\n" @@ -1409,7 +1368,7 @@ void sve_hybrid_s8qa_dot_4x4VL ( ".inst 0x04a477bd // sqrdmulh z29.s, z29.s, z4.s\n" ".inst 0x04a477de // sqrdmulh z30.s, z30.s, z4.s\n" ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n" - "tbz %x[flags], #5, 59f\n" + "tbz %x[flags], #5, 55f\n" "and z4.d, z16.d, z0.d\n" "asr z4.s, z4.s, #0x1f\n" "and z5.d, z17.d, z0.d\n" @@ -1458,7 +1417,7 @@ void sve_hybrid_s8qa_dot_4x4VL ( "sqadd z29.s, z29.s, z10.s\n" "sqadd z30.s, z30.s, z4.s\n" "sqadd z31.s, z31.s, z5.s\n" - "59:" // Height 4: no shift correction + "55:" // Height 4: no 
shift correction ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" "add x19, %x[qp], %[c_offset]\n" "ld1rw { z4.s }, p2/Z, [x19]\n" @@ -1489,9 +1448,9 @@ void sve_hybrid_s8qa_dot_4x4VL ( "uzp1 z17.h, z18.h, z19.h\n" "smax z20.s, p2/M, z20.s, z5.s\n" "uzp1 z16.b, z16.b, z17.b\n" - "st1b { z16.b }, p1, [x9]\n" + "st1b { z16.b }, p1, [x26]\n" "add z21.s, z21.s, z4.s\n" - "addvl x9, x9, #1\n" + "addvl x26, x26, #1\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" @@ -1514,61 +1473,58 @@ void sve_hybrid_s8qa_dot_4x4VL ( "uzp1 z21.h, z22.h, z23.h\n" ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" "uzp1 z20.b, z20.b, z21.b\n" - "st1b { z20.b }, p1, [x25]\n" + "st1b { z20.b }, p1, [x22]\n" "add z26.s, z26.s, z4.s\n" - "addvl x25, x25, #1\n" - "add z27.s, z27.s, z4.s\n" "smax z25.s, p2/M, z25.s, z5.s\n" + "add z27.s, z27.s, z4.s\n" ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n" "smin z26.s, p2/M, z26.s, z6.s\n" - "smin z27.s, p2/M, z27.s, z6.s\n" "uzp1 z24.h, z24.h, z25.h\n" + "smin z27.s, p2/M, z27.s, z6.s\n" "add z28.s, z28.s, z4.s\n" "smax z26.s, p2/M, z26.s, z5.s\n" + ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n" "smax z27.s, p2/M, z27.s, z5.s\n" "smin z28.s, p2/M, z28.s, z6.s\n" - ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n" ".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n" + "add z29.s, z29.s, z4.s\n" "uzp1 z25.h, z26.h, z27.h\n" "smax z28.s, p2/M, z28.s, z5.s\n" - "add z29.s, z29.s, z4.s\n" "add z30.s, z30.s, z4.s\n" "uzp1 z24.b, z24.b, z25.b\n" - "st1b { z24.b }, p1, [x23]\n" + "st1b { z24.b }, p1, [x21]\n" "smin z29.s, p2/M, z29.s, z6.s\n" - "addvl x23, x23, #1\n" "smin z30.s, p2/M, z30.s, z6.s\n" ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" "smax z29.s, p2/M, z29.s, z5.s\n" - "add z31.s, z31.s, z4.s\n" "smax z30.s, p2/M, z30.s, z5.s\n" + "add z31.s, z31.s, z4.s\n" "uzp1 z28.h, z28.h, z29.h\n" 
"smin z31.s, p2/M, z31.s, z6.s\n" "smax z31.s, p2/M, z31.s, z5.s\n" "uzp1 z29.h, z30.h, z31.h\n" "uzp1 z28.b, z28.b, z29.b\n" - "st1b { z28.b }, p1, [x21]\n" - "addvl x21, x21, #1\n" - "60:" // Height 4: Writeback done - "decw x12, ALL, MUL #4\n" - "cmp x12, XZR\n" - "bgt 48b\n" + "st1b { z28.b }, p1, [x20]\n" + "56:" // Height 4: Writeback done + "decw x9, ALL, MUL #4\n" + "cmp x9, XZR\n" + "bgt 44b\n" "subs %x[M], %x[M], #0x4\n" - "beq 62f\n" + "beq 58f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 61f\n" + "tbz %x[flags], #3, 57f\n" "add x20, x20, #0x4\n" "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "61:" // Update direct input + "57:" // Update direct input "mov x19, #0x4\n" "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" "b 1b\n" - "62:" // Exit + "58:" // Exit : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) - : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", 
"z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp index b69b561cce..61927236ad 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp @@ -36,7 +36,6 @@ namespace arm_gemm { - // Actual kernel implementations void sve_hybrid_s8qs_dot_6x4VL( ARGLIST ); @@ -73,7 +72,6 @@ public: // Default to the generic kernel kern_type kernel=sve_hybrid_s8qs_dot_6x4VL; - cls_sve_hybrid_s8qs_dot_6x4VL(const CPUInfo *) { } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp index 495637bcdd..f901a814f9 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp @@ -88,182 +88,176 @@ void sve_hybrid_s8qs_dot_6x4VL ( "ptrue p2.b\n" "1:" // Row loop "cmp %x[M], #0x6\n" - "bge 71f\n" + "bge 66f\n" "cmp %x[M], #0x4\n" - "bgt 57f\n" - "beq 43f\n" + "bgt 53f\n" + "beq 40f\n" "cmp %x[M], #0x2\n" - "bgt 29f\n" - "beq 15f\n" - "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" - "mov x16, %x[col_bias]\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz 
%x[flags], #2, 2f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19\n" - "b 3f\n" - "2:" // Height 1: setup direct output - "mov x13, %x[output_ptr]\n" - "3:" // Height 1: Column loop + "bgt 27f\n" + "beq 14f\n" + "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x11, %x[col_bias]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x9, %x[output_ptr]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "2:" // Height 1: Column loop "mov z8.s, #0x0\n" "mov x19, #0x0\n" "mov z9.s, #0x0\n" - "whilelt p1.b, x19, x15\n" + "whilelt p1.b, x19, x10\n" "mov z10.s, #0x0\n" "mov z11.s, #0x0\n" - "4:" // Height 1: setup done - "mov x12, #0x0\n" - "5:" // Height 1: String loop + "3:" // Height 1: setup done + "mov x27, #0x0\n" + "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 6f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 5f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "cbnz x12, 7f\n" + "ldr x25, [x20, #0x0]\n" + "cbnz x27, 6f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "b 7f\n" - "6:" // Height 1: setup direct input - "mov x10, %x[input_ptr]\n" - "7:" // Height 1: input setup done - "cmp x11, #0x10\n" - "ble 9f\n" - "8:" // Height 1: Multiply loop: Main loop head - "ld1b { z6.b }, p2/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" - "sub x11, x11, #0x10\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "add x25, x25, x19\n" + "b 6f\n" + "5:" // Height 1: setup direct input + "mov x25, %x[input_ptr]\n" + "6:" // Height 1: input setup done + "cmp x26, #0x10\n" + "ble 8f\n" + "7:" // Height 1: Multiply loop: Main loop head + "ld1b { z6.b }, p2/Z, [x28]\n" + 
"whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" - "add x10, x10, #0x10\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" + "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" - "cmp x11, #0x10\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "cmp x26, #0x10\n" "sdot z10.s, z6.b, z0.b[0]\n" - "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n" - "prfm pldl1keep, [x10, #0x80]\n" + "ld1b { z6.b }, p2/Z, [x28, #4, MUL VL]\n" + "prfm pldl1keep, [x25, #0x80]\n" "sdot z11.s, z7.b, z0.b[0]\n" - "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n" "sdot z8.s, z6.b, z0.b[1]\n" - "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n" "sdot z9.s, z7.b, z0.b[1]\n" - "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n" - "addvl x14, x14, #16\n" + "ld1b { z7.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #16\n" "sdot z10.s, z6.b, z0.b[1]\n" - "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n" "sdot z11.s, z7.b, z0.b[1]\n" - "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n" "sdot z8.s, z6.b, z0.b[2]\n" - "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-6, MUL VL]\n" "sdot z9.s, z7.b, z0.b[2]\n" - "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-5, MUL VL]\n" "sdot z10.s, z6.b, z0.b[2]\n" - "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-4, MUL VL]\n" "sdot z11.s, z7.b, z0.b[2]\n" - "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-3, MUL VL]\n" "sdot z8.s, z6.b, z0.b[3]\n" - "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[3]\n" - "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n" + "ld1b 
{ z7.b }, p2/Z, [x28, #-1, MUL VL]\n" "sdot z10.s, z6.b, z0.b[3]\n" "sdot z11.s, z7.b, z0.b[3]\n" - "bgt 8b\n" - "9:" // Height 1: Multiply loop: Single iteration only - "ld1b { z6.b }, p2/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "bgt 7b\n" + "8:" // Height 1: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" - "add x10, x10, #0x10\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" + "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" "sdot z10.s, z6.b, z0.b[0]\n" "sdot z11.s, z7.b, z0.b[0]\n" - "ble 10f\n" - "ld1b { z6.b }, p2/Z, [x14]\n" + "ble 9f\n" + "ld1b { z6.b }, p2/Z, [x28]\n" "sdot z8.s, z6.b, z0.b[1]\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "sdot z9.s, z7.b, z0.b[1]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" "sdot z10.s, z6.b, z0.b[1]\n" - "addvl x14, x14, #4\n" + "addvl x28, x28, #4\n" "sdot z11.s, z7.b, z0.b[1]\n" - "ble 10f\n" - "ld1b { z6.b }, p2/Z, [x14]\n" + "ble 9f\n" + "ld1b { z6.b }, p2/Z, [x28]\n" "sdot z8.s, z6.b, z0.b[2]\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "sdot z9.s, z7.b, z0.b[2]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z7.b 
}, p2/Z, [x28, #3, MUL VL]\n" "sdot z10.s, z6.b, z0.b[2]\n" - "addvl x14, x14, #4\n" + "addvl x28, x28, #4\n" "sdot z11.s, z7.b, z0.b[2]\n" - "ble 10f\n" - "ld1b { z6.b }, p2/Z, [x14]\n" + "ble 9f\n" + "ld1b { z6.b }, p2/Z, [x28]\n" "sdot z8.s, z6.b, z0.b[3]\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[3]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" "sdot z10.s, z6.b, z0.b[3]\n" "sdot z11.s, z7.b, z0.b[3]\n" - "10:" // Height 1: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" + "9:" // Height 1: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 5b\n" - "ld1w { z0.s }, p2/Z, [x16]\n" + "cmp x27, x19\n" + "bne 4b\n" + "ld1w { z0.s }, p2/Z, [x11]\n" "add z8.s, z8.s, z0.s\n" - "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n" + "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n" "add z9.s, z9.s, z1.s\n" - "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n" - "addvl x16, x16, #4\n" + "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" "add z10.s, z10.s, z2.s\n" "add z11.s, z11.s, z3.s\n" - "tbz %x[flags], #4, 11f\n" - "ld1w { z0.s }, p2/Z, [x17]\n" - "ld1w { z4.s }, p2/Z, [x8]\n" - "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n" - "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n" - "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n" - "addvl x17, x17, #4\n" - "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n" - "addvl x8, x8, #4\n" - "b 12f\n" - "11:" // Height 1: per layer parameters - "add x19, %x[qp], 
%[per_layer_right_shift]\n" - "ld1rw { z0.s }, p2/Z, [x19]\n" + "tbz %x[flags], #4, 10f\n" + "ld1w { z0.s }, p2/Z, [x12]\n" + "ld1w { z4.s }, p2/Z, [x13]\n" + "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n" + "addvl x12, x12, #4\n" + "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "b 11f\n" + "10:" // Height 1: per layer parameters + "add x24, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z0.s }, p2/Z, [x24]\n" "mov z1.d, z0.d\n" - "add x19, %x[qp], %[per_layer_mul]\n" - "ld1rw { z4.s }, p2/Z, [x19]\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1rw { z4.s }, p2/Z, [x24]\n" "mov z2.d, z0.d\n" "mov z3.d, z0.d\n" "mov z5.d, z4.d\n" "mov z6.d, z4.d\n" "mov z7.d, z4.d\n" - "12:" // Height 1: parameters loaded + "11:" // Height 1: parameters loaded ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" - "tbz %x[flags], #5, 13f\n" + "tbz %x[flags], #5, 12f\n" "and z4.d, z8.d, z0.d\n" "asr z4.s, z4.s, #0x1f\n" "and z5.d, z9.d, z1.d\n" @@ -276,17 +270,17 @@ void sve_hybrid_s8qs_dot_6x4VL ( "sqadd z9.s, z9.s, z5.s\n" "sqadd z10.s, z10.s, z6.s\n" "sqadd z11.s, z11.s, z7.s\n" - "13:" // Height 1: no shift correction + "12:" // Height 1: no shift correction ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" - "add x19, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x19]\n" + "add x24, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x24]\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" - "add x19, %x[qp], %[minval]\n" + "add x24, %x[qp], %[minval]\n" ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" - "ld1rw { z5.s }, p2/Z, [x19]\n" - "add x19, %x[qp], %[maxval]\n" + "ld1rw { z5.s }, p2/Z, [x24]\n" + "add x24, %x[qp], 
%[maxval]\n" ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" - "ld1rw { z6.s }, p2/Z, [x19]\n" + "ld1rw { z6.s }, p2/Z, [x24]\n" "add z8.s, z8.s, z4.s\n" "add z9.s, z9.s, z4.s\n" "add z10.s, z10.s, z4.s\n" @@ -302,230 +296,223 @@ void sve_hybrid_s8qs_dot_6x4VL ( "uzp1 z8.h, z8.h, z9.h\n" "uzp1 z9.h, z10.h, z11.h\n" "uzp1 z8.b, z8.b, z9.b\n" - "st1b { z8.b }, p1, [x13]\n" - "addvl x13, x13, #1\n" - "14:" // Height 1: Writeback done - "decw x15, ALL, MUL #4\n" - "cmp x15, XZR\n" - "bgt 3b\n" - "b 86f\n" - "15:" // Height 2 - "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "mov x16, %x[col_bias]\n" - "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 16f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "add x9, x9, x19\n" - "b 17f\n" - "16:" // Height 2: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19\n" - "17:" // Height 2: Column loop + "st1b { z8.b }, p1, [x9]\n" + "addvl x9, x9, #1\n" + "13:" // Height 1: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 2b\n" + "b 80f\n" + "14:" // Height 2 + "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x11, %x[col_bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x9, %x[output_ptr]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "15:" // Height 2: Column loop "mov z8.s, #0x0\n" "mov x19, #0x0\n" "mov z9.s, #0x0\n" - "whilelt p1.b, x19, x15\n" + "whilelt p1.b, x19, x10\n" "mov z10.s, #0x0\n" "mov z11.s, #0x0\n" "mov z12.s, #0x0\n" "mov z13.s, #0x0\n" "mov z14.s, #0x0\n" "mov z15.s, #0x0\n" - "18:" // Height 2: setup done - "mov x12, #0x0\n" - "19:" // Height 2: String loop + "16:" // Height 2: setup done + "mov x27, #0x0\n" + "17:" // Height 2: String loop "ldr 
x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 20f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 18f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "cbnz x12, 21f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x27, 19f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "b 21f\n" - "20:" // Height 2: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "21:" // Height 2: input setup done - "cmp x11, #0x10\n" - "ble 23f\n" - "22:" // Height 2: Multiply loop: Main loop head - "ld1b { z6.b }, p2/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" - "sub x11, x11, #0x10\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "b 19f\n" + "18:" // Height 2: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "19:" // Height 2: input setup done + "cmp x26, #0x10\n" + "ble 21f\n" + "20:" // Height 2: Multiply loop: Main loop head + "ld1b { z6.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" - "cmp x11, #0x10\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" + "cmp x26, #0x10\n" "sdot z13.s, z7.b, z1.b[0]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" - 
"prfm pldl1keep, [x28, #0x80]\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "prfm pldl1keep, [x24, #0x80]\n" "sdot z10.s, z6.b, z0.b[0]\n" "sdot z14.s, z6.b, z1.b[0]\n" - "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #4, MUL VL]\n" "sdot z11.s, z7.b, z0.b[0]\n" "sdot z15.s, z7.b, z1.b[0]\n" - "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n" "sdot z8.s, z6.b, z0.b[1]\n" "sdot z12.s, z6.b, z1.b[1]\n" - "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n" "sdot z9.s, z7.b, z0.b[1]\n" "sdot z13.s, z7.b, z1.b[1]\n" - "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n" - "addvl x14, x14, #16\n" + "ld1b { z7.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #16\n" "sdot z10.s, z6.b, z0.b[1]\n" "sdot z14.s, z6.b, z1.b[1]\n" - "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n" "sdot z11.s, z7.b, z0.b[1]\n" "sdot z15.s, z7.b, z1.b[1]\n" - "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n" "sdot z8.s, z6.b, z0.b[2]\n" "sdot z12.s, z6.b, z1.b[2]\n" - "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-6, MUL VL]\n" "sdot z9.s, z7.b, z0.b[2]\n" "sdot z13.s, z7.b, z1.b[2]\n" - "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-5, MUL VL]\n" "sdot z10.s, z6.b, z0.b[2]\n" "sdot z14.s, z6.b, z1.b[2]\n" - "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-4, MUL VL]\n" "sdot z11.s, z7.b, z0.b[2]\n" "sdot z15.s, z7.b, z1.b[2]\n" - "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-3, MUL VL]\n" "sdot z8.s, z6.b, z0.b[3]\n" "sdot z12.s, z6.b, z1.b[3]\n" - "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[3]\n" "sdot z13.s, z7.b, z1.b[3]\n" - "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-1, MUL VL]\n" "sdot z10.s, z6.b, 
z0.b[3]\n" "sdot z14.s, z6.b, z1.b[3]\n" "sdot z11.s, z7.b, z0.b[3]\n" "sdot z15.s, z7.b, z1.b[3]\n" - "bgt 22b\n" - "23:" // Height 2: Multiply loop: Single iteration only - "ld1b { z6.b }, p2/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "bgt 20b\n" + "21:" // Height 2: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" - "add x28, x28, #0x10\n" + "add x24, x24, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" "sdot z13.s, z7.b, z1.b[0]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" "sdot z10.s, z6.b, z0.b[0]\n" "sdot z14.s, z6.b, z1.b[0]\n" "sdot z11.s, z7.b, z0.b[0]\n" "sdot z15.s, z7.b, z1.b[0]\n" - "ble 24f\n" - "ld1b { z6.b }, p2/Z, [x14]\n" + "ble 22f\n" + "ld1b { z6.b }, p2/Z, [x28]\n" "sdot z8.s, z6.b, z0.b[1]\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "sdot z12.s, z6.b, z1.b[1]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[1]\n" "sdot z13.s, z7.b, z1.b[1]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" "sdot z10.s, z6.b, z0.b[1]\n" "sdot z14.s, z6.b, z1.b[1]\n" "sdot z11.s, z7.b, z0.b[1]\n" "sdot z15.s, z7.b, z1.b[1]\n" - "ble 24f\n" - "ld1b { z6.b }, p2/Z, [x14]\n" + "ble 22f\n" + "ld1b { 
z6.b }, p2/Z, [x28]\n" "sdot z8.s, z6.b, z0.b[2]\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "sdot z12.s, z6.b, z1.b[2]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[2]\n" "sdot z13.s, z7.b, z1.b[2]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" "sdot z10.s, z6.b, z0.b[2]\n" "sdot z14.s, z6.b, z1.b[2]\n" "sdot z11.s, z7.b, z0.b[2]\n" "sdot z15.s, z7.b, z1.b[2]\n" - "ble 24f\n" - "ld1b { z6.b }, p2/Z, [x14]\n" + "ble 22f\n" + "ld1b { z6.b }, p2/Z, [x28]\n" "sdot z8.s, z6.b, z0.b[3]\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" "sdot z12.s, z6.b, z1.b[3]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[3]\n" "sdot z13.s, z7.b, z1.b[3]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" "sdot z10.s, z6.b, z0.b[3]\n" "sdot z14.s, z6.b, z1.b[3]\n" "sdot z11.s, z7.b, z0.b[3]\n" "sdot z15.s, z7.b, z1.b[3]\n" - "24:" // Height 2: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" + "22:" // Height 2: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" + "prfm pldl1keep, [x24, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 19b\n" - "ld1w { z0.s }, p2/Z, [x16]\n" + "cmp x27, x19\n" + "bne 17b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z0.s }, p2/Z, [x11]\n" "add z8.s, z8.s, z0.s\n" - "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n" + "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n" + "add x23, x9, x19\n" "add z12.s, z12.s, z0.s\n" - "ld1w { z2.s }, 
p2/Z, [x16, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n" "add z9.s, z9.s, z1.s\n" - "addvl x16, x16, #4\n" + "addvl x11, x11, #4\n" "add z13.s, z13.s, z1.s\n" "add z10.s, z10.s, z2.s\n" "add z11.s, z11.s, z3.s\n" "add z14.s, z14.s, z2.s\n" "add z15.s, z15.s, z3.s\n" - "tbz %x[flags], #4, 25f\n" - "ld1w { z0.s }, p2/Z, [x17]\n" - "ld1w { z4.s }, p2/Z, [x8]\n" - "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n" - "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n" - "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n" - "addvl x17, x17, #4\n" - "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n" - "addvl x8, x8, #4\n" - "b 26f\n" - "25:" // Height 2: per layer parameters - "add x19, %x[qp], %[per_layer_right_shift]\n" - "ld1rw { z0.s }, p2/Z, [x19]\n" + "tbz %x[flags], #4, 23f\n" + "ld1w { z0.s }, p2/Z, [x12]\n" + "ld1w { z4.s }, p2/Z, [x13]\n" + "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n" + "addvl x12, x12, #4\n" + "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "b 24f\n" + "23:" // Height 2: per layer parameters + "add x24, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z0.s }, p2/Z, [x24]\n" "mov z1.d, z0.d\n" - "add x19, %x[qp], %[per_layer_mul]\n" - "ld1rw { z4.s }, p2/Z, [x19]\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1rw { z4.s }, p2/Z, [x24]\n" "mov z2.d, z0.d\n" "mov z3.d, z0.d\n" "mov z5.d, z4.d\n" "mov z6.d, z4.d\n" "mov z7.d, z4.d\n" - "26:" // Height 2: parameters loaded + "24:" // Height 2: parameters loaded ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" @@ -534,7 +521,7 @@ void 
sve_hybrid_s8qs_dot_6x4VL ( ".inst 0x04a575ad // sqrdmulh z13.s, z13.s, z5.s\n" ".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n" ".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n" - "tbz %x[flags], #5, 27f\n" + "tbz %x[flags], #5, 25f\n" "and z4.d, z8.d, z0.d\n" "asr z4.s, z4.s, #0x1f\n" "and z5.d, z9.d, z1.d\n" @@ -559,17 +546,17 @@ void sve_hybrid_s8qs_dot_6x4VL ( "sqadd z13.s, z13.s, z5.s\n" "sqadd z14.s, z14.s, z6.s\n" "sqadd z15.s, z15.s, z7.s\n" - "27:" // Height 2: no shift correction + "25:" // Height 2: no shift correction ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" - "add x19, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x19]\n" + "add x24, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x24]\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" - "add x19, %x[qp], %[minval]\n" + "add x24, %x[qp], %[minval]\n" ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" - "ld1rw { z5.s }, p2/Z, [x19]\n" - "add x19, %x[qp], %[maxval]\n" + "ld1rw { z5.s }, p2/Z, [x24]\n" + "add x24, %x[qp], %[maxval]\n" ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" - "ld1rw { z6.s }, p2/Z, [x19]\n" + "ld1rw { z6.s }, p2/Z, [x24]\n" ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n" "add z8.s, z8.s, z4.s\n" "add z9.s, z9.s, z4.s\n" @@ -590,9 +577,9 @@ void sve_hybrid_s8qs_dot_6x4VL ( "uzp1 z9.h, z10.h, z11.h\n" "smax z12.s, p2/M, z12.s, z5.s\n" "uzp1 z8.b, z8.b, z9.b\n" - "st1b { z8.b }, p1, [x13]\n" + "st1b { z8.b }, p1, [x9]\n" "add z13.s, z13.s, z4.s\n" - "addvl x13, x13, #1\n" + "addvl x9, x9, #1\n" ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n" ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n" "smin z13.s, p2/M, z13.s, z6.s\n" @@ -606,37 +593,24 @@ void sve_hybrid_s8qs_dot_6x4VL ( "smax z15.s, p2/M, z15.s, z5.s\n" "uzp1 z13.h, z14.h, z15.h\n" "uzp1 z12.b, z12.b, z13.b\n" - "st1b { z12.b }, p1, [x9]\n" - "addvl x9, x9, #1\n" - "28:" // Height 2: Writeback done - "decw x15, ALL, MUL #4\n" - "cmp x15, XZR\n" - "bgt 17b\n" - "b 86f\n" 
- "29:" // Height 3 - "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "mov x16, %x[col_bias]\n" - "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 30f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19\n" - "add x27, x27, x19\n" - "b 31f\n" - "30:" // Height 3: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19\n" - "add x27, x9, x19\n" - "31:" // Height 3: Column loop + "st1b { z12.b }, p1, [x23]\n" + "26:" // Height 2: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 15b\n" + "b 80f\n" + "27:" // Height 3 + "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x11, %x[col_bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x9, %x[output_ptr]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "28:" // Height 3: Column loop "mov z8.s, #0x0\n" "mov x19, #0x0\n" "mov z9.s, #0x0\n" - "whilelt p1.b, x19, x15\n" + "whilelt p1.b, x19, x10\n" "mov z10.s, #0x0\n" "mov z11.s, #0x0\n" "mov z12.s, #0x0\n" @@ -647,208 +621,211 @@ void sve_hybrid_s8qs_dot_6x4VL ( "mov z17.s, #0x0\n" "mov z18.s, #0x0\n" "mov z19.s, #0x0\n" - "32:" // Height 3: setup done - "mov x12, #0x0\n" - "33:" // Height 3: String loop + "29:" // Height 3: setup done + "mov x27, #0x0\n" + "30:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 34f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 31f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr 
x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "cbnz x12, 35f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "cbnz x27, 32f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "add x26, x26, x19\n" - "b 35f\n" - "34:" // Height 3: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "add x26, x28, x19\n" - "35:" // Height 3: input setup done - "cmp x11, #0x10\n" - "ble 37f\n" - "36:" // Height 3: Multiply loop: Main loop head - "ld1b { z6.b }, p2/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" - "sub x11, x11, #0x10\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "b 32f\n" + "31:" // Height 3: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "32:" // Height 3: input setup done + "cmp x26, #0x10\n" + "ble 34f\n" + "33:" // Height 3: Multiply loop: Main loop head + "ld1b { z6.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x23, x23, #0x10\n" "sdot z16.s, z6.b, z2.b[0]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" - "cmp x11, #0x10\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" + "cmp x26, #0x10\n" "sdot z13.s, z7.b, z1.b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" "sdot z17.s, z7.b, z2.b[0]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL 
VL]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "prfm pldl1keep, [x23, #0x80]\n" "sdot z10.s, z6.b, z0.b[0]\n" "sdot z14.s, z6.b, z1.b[0]\n" "sdot z18.s, z6.b, z2.b[0]\n" - "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #4, MUL VL]\n" "sdot z11.s, z7.b, z0.b[0]\n" "sdot z15.s, z7.b, z1.b[0]\n" "sdot z19.s, z7.b, z2.b[0]\n" - "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n" "sdot z8.s, z6.b, z0.b[1]\n" "sdot z12.s, z6.b, z1.b[1]\n" "sdot z16.s, z6.b, z2.b[1]\n" - "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n" "sdot z9.s, z7.b, z0.b[1]\n" "sdot z13.s, z7.b, z1.b[1]\n" "sdot z17.s, z7.b, z2.b[1]\n" - "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n" - "addvl x14, x14, #16\n" + "ld1b { z7.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #16\n" "sdot z10.s, z6.b, z0.b[1]\n" "sdot z14.s, z6.b, z1.b[1]\n" "sdot z18.s, z6.b, z2.b[1]\n" - "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n" "sdot z11.s, z7.b, z0.b[1]\n" "sdot z15.s, z7.b, z1.b[1]\n" "sdot z19.s, z7.b, z2.b[1]\n" - "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n" "sdot z8.s, z6.b, z0.b[2]\n" "sdot z12.s, z6.b, z1.b[2]\n" "sdot z16.s, z6.b, z2.b[2]\n" - "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-6, MUL VL]\n" "sdot z9.s, z7.b, z0.b[2]\n" "sdot z13.s, z7.b, z1.b[2]\n" "sdot z17.s, z7.b, z2.b[2]\n" - "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-5, MUL VL]\n" "sdot z10.s, z6.b, z0.b[2]\n" "sdot z14.s, z6.b, z1.b[2]\n" "sdot z18.s, z6.b, z2.b[2]\n" - "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-4, MUL VL]\n" "sdot z11.s, z7.b, z0.b[2]\n" "sdot z15.s, z7.b, z1.b[2]\n" "sdot z19.s, z7.b, z2.b[2]\n" - "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-3, MUL VL]\n" "sdot z8.s, z6.b, 
z0.b[3]\n" "sdot z12.s, z6.b, z1.b[3]\n" "sdot z16.s, z6.b, z2.b[3]\n" - "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[3]\n" "sdot z13.s, z7.b, z1.b[3]\n" "sdot z17.s, z7.b, z2.b[3]\n" - "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-1, MUL VL]\n" "sdot z10.s, z6.b, z0.b[3]\n" "sdot z14.s, z6.b, z1.b[3]\n" "sdot z18.s, z6.b, z2.b[3]\n" "sdot z11.s, z7.b, z0.b[3]\n" "sdot z15.s, z7.b, z1.b[3]\n" "sdot z19.s, z7.b, z2.b[3]\n" - "bgt 36b\n" - "37:" // Height 3: Multiply loop: Single iteration only - "ld1b { z6.b }, p2/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "bgt 33b\n" + "34:" // Height 3: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" - "add x26, x26, #0x10\n" + "add x23, x23, #0x10\n" "sdot z13.s, z7.b, z1.b[0]\n" "sdot z16.s, z6.b, z2.b[0]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" "sdot z17.s, z7.b, z2.b[0]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" "sdot z10.s, z6.b, z0.b[0]\n" "sdot z14.s, z6.b, z1.b[0]\n" "sdot z18.s, z6.b, z2.b[0]\n" "sdot z11.s, z7.b, z0.b[0]\n" "sdot z15.s, z7.b, z1.b[0]\n" "sdot z19.s, z7.b, z2.b[0]\n" - "ble 38f\n" - "ld1b { z6.b }, p2/Z, [x14]\n" + "ble 35f\n" + "ld1b { z6.b }, p2/Z, [x28]\n" "sdot z8.s, z6.b, 
z0.b[1]\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "sdot z12.s, z6.b, z1.b[1]\n" "sdot z16.s, z6.b, z2.b[1]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[1]\n" "sdot z13.s, z7.b, z1.b[1]\n" "sdot z17.s, z7.b, z2.b[1]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" "sdot z10.s, z6.b, z0.b[1]\n" "sdot z14.s, z6.b, z1.b[1]\n" "sdot z18.s, z6.b, z2.b[1]\n" "sdot z11.s, z7.b, z0.b[1]\n" "sdot z15.s, z7.b, z1.b[1]\n" "sdot z19.s, z7.b, z2.b[1]\n" - "ble 38f\n" - "ld1b { z6.b }, p2/Z, [x14]\n" + "ble 35f\n" + "ld1b { z6.b }, p2/Z, [x28]\n" "sdot z8.s, z6.b, z0.b[2]\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "sdot z12.s, z6.b, z1.b[2]\n" "sdot z16.s, z6.b, z2.b[2]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[2]\n" "sdot z13.s, z7.b, z1.b[2]\n" "sdot z17.s, z7.b, z2.b[2]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" "sdot z10.s, z6.b, z0.b[2]\n" "sdot z14.s, z6.b, z1.b[2]\n" "sdot z18.s, z6.b, z2.b[2]\n" "sdot z11.s, z7.b, z0.b[2]\n" "sdot z15.s, z7.b, z1.b[2]\n" "sdot z19.s, z7.b, z2.b[2]\n" - "ble 38f\n" - "ld1b { z6.b }, p2/Z, [x14]\n" + "ble 35f\n" + "ld1b { z6.b }, p2/Z, [x28]\n" "sdot z8.s, z6.b, z0.b[3]\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" "sdot z12.s, z6.b, z1.b[3]\n" "sdot z16.s, z6.b, z2.b[3]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[3]\n" "sdot z13.s, z7.b, z1.b[3]\n" "sdot z17.s, z7.b, z2.b[3]\n" - "ld1b { 
z7.b }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" "sdot z10.s, z6.b, z0.b[3]\n" "sdot z14.s, z6.b, z1.b[3]\n" "sdot z18.s, z6.b, z2.b[3]\n" "sdot z11.s, z7.b, z0.b[3]\n" "sdot z15.s, z7.b, z1.b[3]\n" "sdot z19.s, z7.b, z2.b[3]\n" - "38:" // Height 3: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "35:" // Height 3: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 33b\n" - "ld1w { z0.s }, p2/Z, [x16]\n" + "cmp x27, x19\n" + "bne 30b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z0.s }, p2/Z, [x11]\n" "add z8.s, z8.s, z0.s\n" - "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n" + "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n" + "add x23, x9, x19\n" "add z12.s, z12.s, z0.s\n" - "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n" + "add x22, x23, x19\n" "add z16.s, z16.s, z0.s\n" - "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n" - "addvl x16, x16, #4\n" + "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" "add z9.s, z9.s, z1.s\n" "add z13.s, z13.s, z1.s\n" "add z10.s, z10.s, z2.s\n" @@ -858,30 +835,30 @@ void sve_hybrid_s8qs_dot_6x4VL ( "add z17.s, z17.s, z1.s\n" "add z18.s, z18.s, z2.s\n" "add z19.s, z19.s, z3.s\n" - "tbz %x[flags], #4, 39f\n" - "ld1w { z0.s }, p2/Z, [x17]\n" - "ld1w { z4.s }, p2/Z, [x8]\n" - "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n" - "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n" - "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n" - "addvl x17, x17, #4\n" - "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n" - "addvl x8, x8, #4\n" - "b 40f\n" - 
"39:" // Height 3: per layer parameters - "add x19, %x[qp], %[per_layer_right_shift]\n" - "ld1rw { z0.s }, p2/Z, [x19]\n" + "tbz %x[flags], #4, 36f\n" + "ld1w { z0.s }, p2/Z, [x12]\n" + "ld1w { z4.s }, p2/Z, [x13]\n" + "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n" + "addvl x12, x12, #4\n" + "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "b 37f\n" + "36:" // Height 3: per layer parameters + "add x24, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z0.s }, p2/Z, [x24]\n" "mov z1.d, z0.d\n" - "add x19, %x[qp], %[per_layer_mul]\n" - "ld1rw { z4.s }, p2/Z, [x19]\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1rw { z4.s }, p2/Z, [x24]\n" "mov z2.d, z0.d\n" "mov z3.d, z0.d\n" "mov z5.d, z4.d\n" "mov z6.d, z4.d\n" "mov z7.d, z4.d\n" - "40:" // Height 3: parameters loaded + "37:" // Height 3: parameters loaded ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" @@ -894,7 +871,7 @@ void sve_hybrid_s8qs_dot_6x4VL ( ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n" ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n" ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n" - "tbz %x[flags], #5, 41f\n" + "tbz %x[flags], #5, 38f\n" "and z4.d, z8.d, z0.d\n" "asr z4.s, z4.s, #0x1f\n" "and z5.d, z9.d, z1.d\n" @@ -931,17 +908,17 @@ void sve_hybrid_s8qs_dot_6x4VL ( "sqadd z17.s, z17.s, z5.s\n" "sqadd z18.s, z18.s, z6.s\n" "sqadd z19.s, z19.s, z7.s\n" - "41:" // Height 3: no shift correction + "38:" // Height 3: no shift correction ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" - "add x19, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x19]\n" + "add x24, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x24]\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" - "add x19, %x[qp], 
%[minval]\n" + "add x24, %x[qp], %[minval]\n" ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" - "ld1rw { z5.s }, p2/Z, [x19]\n" - "add x19, %x[qp], %[maxval]\n" + "ld1rw { z5.s }, p2/Z, [x24]\n" + "add x24, %x[qp], %[maxval]\n" ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" - "ld1rw { z6.s }, p2/Z, [x19]\n" + "ld1rw { z6.s }, p2/Z, [x24]\n" ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n" "add z8.s, z8.s, z4.s\n" "add z9.s, z9.s, z4.s\n" @@ -962,9 +939,9 @@ void sve_hybrid_s8qs_dot_6x4VL ( "uzp1 z9.h, z10.h, z11.h\n" "smax z12.s, p2/M, z12.s, z5.s\n" "uzp1 z8.b, z8.b, z9.b\n" - "st1b { z8.b }, p1, [x13]\n" + "st1b { z8.b }, p1, [x9]\n" "add z13.s, z13.s, z4.s\n" - "addvl x13, x13, #1\n" + "addvl x9, x9, #1\n" ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n" ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" @@ -987,52 +964,35 @@ void sve_hybrid_s8qs_dot_6x4VL ( "uzp1 z13.h, z14.h, z15.h\n" ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" "uzp1 z12.b, z12.b, z13.b\n" - "st1b { z12.b }, p1, [x9]\n" + "st1b { z12.b }, p1, [x23]\n" "add z18.s, z18.s, z4.s\n" - "addvl x9, x9, #1\n" - "add z19.s, z19.s, z4.s\n" "smax z17.s, p2/M, z17.s, z5.s\n" + "add z19.s, z19.s, z4.s\n" "smin z18.s, p2/M, z18.s, z6.s\n" - "smin z19.s, p2/M, z19.s, z6.s\n" "uzp1 z16.h, z16.h, z17.h\n" + "smin z19.s, p2/M, z19.s, z6.s\n" "smax z18.s, p2/M, z18.s, z5.s\n" "smax z19.s, p2/M, z19.s, z5.s\n" "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "st1b { z16.b }, p1, [x27]\n" - "addvl x27, x27, #1\n" - "42:" // Height 3: Writeback done - "decw x15, ALL, MUL #4\n" - "cmp x15, XZR\n" - "bgt 31b\n" - "b 86f\n" - "43:" // Height 4 - "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "mov x16, %x[col_bias]\n" - "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], 
%[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 44f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "add x27, x27, x19\n" - "add x25, x25, x19\n" - "b 45f\n" - "44:" // Height 4: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19\n" - "add x27, x9, x19\n" - "add x25, x27, x19\n" - "45:" // Height 4: Column loop + "st1b { z16.b }, p1, [x22]\n" + "39:" // Height 3: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 28b\n" + "b 80f\n" + "40:" // Height 4 + "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x11, %x[col_bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x9, %x[output_ptr]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "41:" // Height 4: Column loop "mov z8.s, #0x0\n" "mov x19, #0x0\n" "mov z9.s, #0x0\n" - "whilelt p1.b, x19, x15\n" + "whilelt p1.b, x19, x10\n" "mov z10.s, #0x0\n" "mov z11.s, #0x0\n" "mov z12.s, #0x0\n" @@ -1047,123 +1007,123 @@ void sve_hybrid_s8qs_dot_6x4VL ( "mov z21.s, #0x0\n" "mov z22.s, #0x0\n" "mov z23.s, #0x0\n" - "46:" // Height 4: setup done - "mov x12, #0x0\n" - "47:" // Height 4: String loop + "42:" // Height 4: setup done + "mov x27, #0x0\n" + "43:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 48f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 44f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "cbnz x12, 49f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, 
[x20, #0x18]\n" + "cbnz x27, 45f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "add x26, x26, x19\n" + "add x25, x25, x19\n" "add x24, x24, x19\n" - "b 49f\n" - "48:" // Height 4: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "add x26, x28, x19\n" - "add x24, x26, x19\n" - "49:" // Height 4: input setup done - "cmp x11, #0x10\n" - "ble 51f\n" - "50:" // Height 4: Multiply loop: Main loop head - "ld1b { z6.b }, p2/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" - "sub x11, x11, #0x10\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "b 45f\n" + "44:" // Height 4: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "45:" // Height 4: input setup done + "cmp x26, #0x10\n" + "ble 47f\n" + "46:" // Height 4: Multiply loop: Main loop head + "ld1b { z6.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqb { z3.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "sdot z16.s, z6.b, z2.b[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x24, x24, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x22, x22, #0x10\n" "sdot z13.s, z7.b, z1.b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "cmp x11, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x26, #0x10\n" "sdot z20.s, z6.b, z3.b[0]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, 
[x28, #2, MUL VL]\n" "sdot z17.s, z7.b, z2.b[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "sdot z21.s, z7.b, z3.b[0]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" - "prfm pldl1keep, [x24, #0x80]\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "prfm pldl1keep, [x22, #0x80]\n" "sdot z10.s, z6.b, z0.b[0]\n" "sdot z14.s, z6.b, z1.b[0]\n" "sdot z18.s, z6.b, z2.b[0]\n" "sdot z22.s, z6.b, z3.b[0]\n" - "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #4, MUL VL]\n" "sdot z11.s, z7.b, z0.b[0]\n" "sdot z15.s, z7.b, z1.b[0]\n" "sdot z19.s, z7.b, z2.b[0]\n" "sdot z23.s, z7.b, z3.b[0]\n" - "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n" "sdot z8.s, z6.b, z0.b[1]\n" "sdot z12.s, z6.b, z1.b[1]\n" "sdot z16.s, z6.b, z2.b[1]\n" "sdot z20.s, z6.b, z3.b[1]\n" - "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n" "sdot z9.s, z7.b, z0.b[1]\n" "sdot z13.s, z7.b, z1.b[1]\n" "sdot z17.s, z7.b, z2.b[1]\n" "sdot z21.s, z7.b, z3.b[1]\n" - "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n" - "addvl x14, x14, #16\n" + "ld1b { z7.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #16\n" "sdot z10.s, z6.b, z0.b[1]\n" "sdot z14.s, z6.b, z1.b[1]\n" "sdot z18.s, z6.b, z2.b[1]\n" "sdot z22.s, z6.b, z3.b[1]\n" - "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n" "sdot z11.s, z7.b, z0.b[1]\n" "sdot z15.s, z7.b, z1.b[1]\n" "sdot z19.s, z7.b, z2.b[1]\n" "sdot z23.s, z7.b, z3.b[1]\n" - "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n" "sdot z8.s, z6.b, z0.b[2]\n" "sdot z12.s, z6.b, z1.b[2]\n" "sdot z16.s, z6.b, z2.b[2]\n" "sdot z20.s, z6.b, z3.b[2]\n" - "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-6, MUL VL]\n" "sdot z9.s, z7.b, z0.b[2]\n" "sdot z13.s, z7.b, z1.b[2]\n" "sdot z17.s, z7.b, z2.b[2]\n" "sdot z21.s, z7.b, z3.b[2]\n" - "ld1b { z7.b }, p2/Z, [x14, #-5, MUL 
VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-5, MUL VL]\n" "sdot z10.s, z6.b, z0.b[2]\n" "sdot z14.s, z6.b, z1.b[2]\n" "sdot z18.s, z6.b, z2.b[2]\n" "sdot z22.s, z6.b, z3.b[2]\n" - "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-4, MUL VL]\n" "sdot z11.s, z7.b, z0.b[2]\n" "sdot z15.s, z7.b, z1.b[2]\n" "sdot z19.s, z7.b, z2.b[2]\n" "sdot z23.s, z7.b, z3.b[2]\n" - "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-3, MUL VL]\n" "sdot z8.s, z6.b, z0.b[3]\n" "sdot z12.s, z6.b, z1.b[3]\n" "sdot z16.s, z6.b, z2.b[3]\n" "sdot z20.s, z6.b, z3.b[3]\n" - "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[3]\n" "sdot z13.s, z7.b, z1.b[3]\n" "sdot z17.s, z7.b, z2.b[3]\n" "sdot z21.s, z7.b, z3.b[3]\n" - "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-1, MUL VL]\n" "sdot z10.s, z6.b, z0.b[3]\n" "sdot z14.s, z6.b, z1.b[3]\n" "sdot z18.s, z6.b, z2.b[3]\n" @@ -1172,31 +1132,31 @@ void sve_hybrid_s8qs_dot_6x4VL ( "sdot z15.s, z7.b, z1.b[3]\n" "sdot z19.s, z7.b, z2.b[3]\n" "sdot z23.s, z7.b, z3.b[3]\n" - "bgt 50b\n" - "51:" // Height 4: Multiply loop: Single iteration only - "ld1b { z6.b }, p2/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "bgt 46b\n" + "47:" // Height 4: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "add x26, x26, 
#0x10\n" + "ld1rqb { z3.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "sdot z16.s, z6.b, z2.b[0]\n" - "add x24, x24, #0x10\n" + "add x22, x22, #0x10\n" "sdot z13.s, z7.b, z1.b[0]\n" "sdot z17.s, z7.b, z2.b[0]\n" "sdot z20.s, z6.b, z3.b[0]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" "sdot z21.s, z7.b, z3.b[0]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" "sdot z10.s, z6.b, z0.b[0]\n" "sdot z14.s, z6.b, z1.b[0]\n" "sdot z18.s, z6.b, z2.b[0]\n" @@ -1205,21 +1165,21 @@ void sve_hybrid_s8qs_dot_6x4VL ( "sdot z15.s, z7.b, z1.b[0]\n" "sdot z19.s, z7.b, z2.b[0]\n" "sdot z23.s, z7.b, z3.b[0]\n" - "ble 52f\n" - "ld1b { z6.b }, p2/Z, [x14]\n" + "ble 48f\n" + "ld1b { z6.b }, p2/Z, [x28]\n" "sdot z8.s, z6.b, z0.b[1]\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "sdot z12.s, z6.b, z1.b[1]\n" "sdot z16.s, z6.b, z2.b[1]\n" "sdot z20.s, z6.b, z3.b[1]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[1]\n" "sdot z13.s, z7.b, z1.b[1]\n" "sdot z17.s, z7.b, z2.b[1]\n" "sdot z21.s, z7.b, z3.b[1]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" "sdot z10.s, z6.b, z0.b[1]\n" "sdot z14.s, z6.b, z1.b[1]\n" "sdot z18.s, z6.b, z2.b[1]\n" @@ -1228,21 +1188,21 @@ void sve_hybrid_s8qs_dot_6x4VL ( "sdot z15.s, z7.b, z1.b[1]\n" "sdot z19.s, z7.b, z2.b[1]\n" "sdot z23.s, z7.b, z3.b[1]\n" - "ble 52f\n" - "ld1b { z6.b }, p2/Z, [x14]\n" + "ble 48f\n" + "ld1b { z6.b }, p2/Z, [x28]\n" "sdot z8.s, z6.b, z0.b[2]\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "sdot z12.s, z6.b, z1.b[2]\n" "sdot z16.s, z6.b, 
z2.b[2]\n" "sdot z20.s, z6.b, z3.b[2]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[2]\n" "sdot z13.s, z7.b, z1.b[2]\n" "sdot z17.s, z7.b, z2.b[2]\n" "sdot z21.s, z7.b, z3.b[2]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" "sdot z10.s, z6.b, z0.b[2]\n" "sdot z14.s, z6.b, z1.b[2]\n" "sdot z18.s, z6.b, z2.b[2]\n" @@ -1251,20 +1211,20 @@ void sve_hybrid_s8qs_dot_6x4VL ( "sdot z15.s, z7.b, z1.b[2]\n" "sdot z19.s, z7.b, z2.b[2]\n" "sdot z23.s, z7.b, z3.b[2]\n" - "ble 52f\n" - "ld1b { z6.b }, p2/Z, [x14]\n" + "ble 48f\n" + "ld1b { z6.b }, p2/Z, [x28]\n" "sdot z8.s, z6.b, z0.b[3]\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" "sdot z12.s, z6.b, z1.b[3]\n" "sdot z16.s, z6.b, z2.b[3]\n" "sdot z20.s, z6.b, z3.b[3]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[3]\n" "sdot z13.s, z7.b, z1.b[3]\n" "sdot z17.s, z7.b, z2.b[3]\n" "sdot z21.s, z7.b, z3.b[3]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" "sdot z10.s, z6.b, z0.b[3]\n" "sdot z14.s, z6.b, z1.b[3]\n" "sdot z18.s, z6.b, z2.b[3]\n" @@ -1273,24 +1233,28 @@ void sve_hybrid_s8qs_dot_6x4VL ( "sdot z15.s, z7.b, z1.b[3]\n" "sdot z19.s, z7.b, z2.b[3]\n" "sdot z23.s, z7.b, z3.b[3]\n" - "52:" // Height 4: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "48:" // Height 4: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 47b\n" - 
"ld1w { z0.s }, p2/Z, [x16]\n" + "cmp x27, x19\n" + "bne 43b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z0.s }, p2/Z, [x11]\n" "add z8.s, z8.s, z0.s\n" - "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n" + "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n" + "add x23, x9, x19\n" "add z12.s, z12.s, z0.s\n" - "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n" + "add x22, x23, x19\n" "add z16.s, z16.s, z0.s\n" - "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n" - "addvl x16, x16, #4\n" + "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n" + "add x21, x22, x19\n" "add z9.s, z9.s, z1.s\n" + "addvl x11, x11, #4\n" "add z13.s, z13.s, z1.s\n" "add z10.s, z10.s, z2.s\n" "add z11.s, z11.s, z3.s\n" @@ -1303,30 +1267,30 @@ void sve_hybrid_s8qs_dot_6x4VL ( "add z21.s, z21.s, z1.s\n" "add z22.s, z22.s, z2.s\n" "add z23.s, z23.s, z3.s\n" - "tbz %x[flags], #4, 53f\n" - "ld1w { z0.s }, p2/Z, [x17]\n" - "ld1w { z4.s }, p2/Z, [x8]\n" - "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n" - "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n" - "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n" - "addvl x17, x17, #4\n" - "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n" - "addvl x8, x8, #4\n" - "b 54f\n" - "53:" // Height 4: per layer parameters - "add x19, %x[qp], %[per_layer_right_shift]\n" - "ld1rw { z0.s }, p2/Z, [x19]\n" + "tbz %x[flags], #4, 49f\n" + "ld1w { z0.s }, p2/Z, [x12]\n" + "ld1w { z4.s }, p2/Z, [x13]\n" + "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n" + "addvl x12, x12, #4\n" + "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "b 50f\n" + "49:" // Height 4: per layer parameters + "add x24, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z0.s }, p2/Z, [x24]\n" "mov z1.d, z0.d\n" - "add x19, 
%x[qp], %[per_layer_mul]\n" - "ld1rw { z4.s }, p2/Z, [x19]\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1rw { z4.s }, p2/Z, [x24]\n" "mov z2.d, z0.d\n" "mov z3.d, z0.d\n" "mov z5.d, z4.d\n" "mov z6.d, z4.d\n" "mov z7.d, z4.d\n" - "54:" // Height 4: parameters loaded + "50:" // Height 4: parameters loaded ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" @@ -1343,7 +1307,7 @@ void sve_hybrid_s8qs_dot_6x4VL ( ".inst 0x04a576b5 // sqrdmulh z21.s, z21.s, z5.s\n" ".inst 0x04a676d6 // sqrdmulh z22.s, z22.s, z6.s\n" ".inst 0x04a776f7 // sqrdmulh z23.s, z23.s, z7.s\n" - "tbz %x[flags], #5, 55f\n" + "tbz %x[flags], #5, 51f\n" "and z4.d, z8.d, z0.d\n" "asr z4.s, z4.s, #0x1f\n" "and z5.d, z9.d, z1.d\n" @@ -1392,17 +1356,17 @@ void sve_hybrid_s8qs_dot_6x4VL ( "sqadd z21.s, z21.s, z5.s\n" "sqadd z22.s, z22.s, z6.s\n" "sqadd z23.s, z23.s, z7.s\n" - "55:" // Height 4: no shift correction + "51:" // Height 4: no shift correction ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" - "add x19, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x19]\n" + "add x24, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x24]\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" - "add x19, %x[qp], %[minval]\n" + "add x24, %x[qp], %[minval]\n" ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" - "ld1rw { z5.s }, p2/Z, [x19]\n" - "add x19, %x[qp], %[maxval]\n" + "ld1rw { z5.s }, p2/Z, [x24]\n" + "add x24, %x[qp], %[maxval]\n" ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" - "ld1rw { z6.s }, p2/Z, [x19]\n" + "ld1rw { z6.s }, p2/Z, [x24]\n" ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n" "add z8.s, z8.s, z4.s\n" "add z9.s, z9.s, z4.s\n" @@ -1423,9 +1387,9 @@ void sve_hybrid_s8qs_dot_6x4VL ( "uzp1 z9.h, z10.h, z11.h\n" "smax z12.s, p2/M, z12.s, z5.s\n" "uzp1 z8.b, z8.b, z9.b\n" - "st1b { z8.b }, p1, [x13]\n" + "st1b { z8.b }, p1, [x9]\n" "add z13.s, z13.s, z4.s\n" - "addvl 
x13, x13, #1\n" + "addvl x9, x9, #1\n" ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n" ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" @@ -1448,76 +1412,55 @@ void sve_hybrid_s8qs_dot_6x4VL ( "uzp1 z13.h, z14.h, z15.h\n" ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" "uzp1 z12.b, z12.b, z13.b\n" - "st1b { z12.b }, p1, [x9]\n" + "st1b { z12.b }, p1, [x23]\n" "add z18.s, z18.s, z4.s\n" - "addvl x9, x9, #1\n" - "add z19.s, z19.s, z4.s\n" "smax z17.s, p2/M, z17.s, z5.s\n" + "add z19.s, z19.s, z4.s\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" "smin z18.s, p2/M, z18.s, z6.s\n" - "smin z19.s, p2/M, z19.s, z6.s\n" "uzp1 z16.h, z16.h, z17.h\n" + "smin z19.s, p2/M, z19.s, z6.s\n" "add z20.s, z20.s, z4.s\n" "smax z18.s, p2/M, z18.s, z5.s\n" + ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n" "smax z19.s, p2/M, z19.s, z5.s\n" "smin z20.s, p2/M, z20.s, z6.s\n" - ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n" ".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n" + "add z21.s, z21.s, z4.s\n" "uzp1 z17.h, z18.h, z19.h\n" "smax z20.s, p2/M, z20.s, z5.s\n" - "add z21.s, z21.s, z4.s\n" "add z22.s, z22.s, z4.s\n" "uzp1 z16.b, z16.b, z17.b\n" - "st1b { z16.b }, p1, [x27]\n" + "st1b { z16.b }, p1, [x22]\n" "smin z21.s, p2/M, z21.s, z6.s\n" - "addvl x27, x27, #1\n" "smin z22.s, p2/M, z22.s, z6.s\n" ".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n" "smax z21.s, p2/M, z21.s, z5.s\n" - "add z23.s, z23.s, z4.s\n" "smax z22.s, p2/M, z22.s, z5.s\n" + "add z23.s, z23.s, z4.s\n" "uzp1 z20.h, z20.h, z21.h\n" "smin z23.s, p2/M, z23.s, z6.s\n" "smax z23.s, p2/M, z23.s, z5.s\n" "uzp1 z21.h, z22.h, z23.h\n" "uzp1 z20.b, z20.b, z21.b\n" - "st1b { z20.b }, p1, [x25]\n" - "addvl x25, x25, #1\n" - "56:" // Height 4: Writeback done - "decw x15, ALL, MUL #4\n" - "cmp x15, XZR\n" - "bgt 45b\n" - "b 86f\n" - "57:" // Height 5 - "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "mov x16, 
%x[col_bias]\n" - "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 58f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "ldr x23, [%x[output_ptr], #0x20]\n" - "add x27, x27, x19\n" - "add x25, x25, x19\n" - "add x23, x23, x19\n" - "b 59f\n" - "58:" // Height 5: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19\n" - "add x27, x9, x19\n" - "add x25, x27, x19\n" - "add x23, x25, x19\n" - "59:" // Height 5: Column loop + "st1b { z20.b }, p1, [x21]\n" + "52:" // Height 4: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 41b\n" + "b 80f\n" + "53:" // Height 5 + "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x11, %x[col_bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x9, %x[output_ptr]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "54:" // Height 5: Column loop "mov z8.s, #0x0\n" "mov x19, #0x0\n" "mov z9.s, #0x0\n" - "whilelt p1.b, x19, x15\n" + "whilelt p1.b, x19, x10\n" "mov z10.s, #0x0\n" "mov z11.s, #0x0\n" "mov z12.s, #0x0\n" @@ -1536,143 +1479,143 @@ void sve_hybrid_s8qs_dot_6x4VL ( "mov z25.s, #0x0\n" "mov z26.s, #0x0\n" "mov z27.s, #0x0\n" - "60:" // Height 5: setup done - "mov x12, #0x0\n" - "61:" // Height 5: String loop + "55:" // Height 5: setup done + "mov x27, #0x0\n" + "56:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 62f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 57f\n" + "ldr x20, 
[%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "ldr x22, [x20, #0x20]\n" - "cbnz x12, 63f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "cbnz x27, 58f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "add x26, x26, x19\n" + "add x25, x25, x19\n" "add x24, x24, x19\n" + "add x23, x23, x19\n" "add x22, x22, x19\n" - "b 63f\n" - "62:" // Height 5: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "add x26, x28, x19\n" - "add x24, x26, x19\n" - "add x22, x24, x19\n" - "63:" // Height 5: input setup done - "cmp x11, #0x10\n" - "ble 65f\n" - "64:" // Height 5: Multiply loop: Main loop head - "ld1b { z6.b }, p2/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" - "sub x11, x11, #0x10\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "add x21, x21, x19\n" + "b 58f\n" + "57:" // Height 5: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "58:" // Height 5: input setup done + "cmp x26, #0x10\n" + "ble 60f\n" + "59:" // Height 5: Multiply loop: Main loop head + "ld1b { z6.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqb { z3.b }, p0/Z, [x22]\n" + "add 
x23, x23, #0x10\n" "sdot z16.s, z6.b, z2.b[0]\n" - "ld1rqb { z4.b }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" - "sdot z13.s, z7.b, z1.b[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" + "ld1rqb { z4.b }, p0/Z, [x21]\n" "add x22, x22, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x21, x21, #0x10\n" "sdot z20.s, z6.b, z3.b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "cmp x11, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x26, #0x10\n" "sdot z24.s, z6.b, z4.b[0]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" "sdot z17.s, z7.b, z2.b[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "sdot z21.s, z7.b, z3.b[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" "sdot z25.s, z7.b, z4.b[0]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" "sdot z10.s, z6.b, z0.b[0]\n" - "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" "sdot z14.s, z6.b, z1.b[0]\n" "sdot z18.s, z6.b, z2.b[0]\n" "sdot z22.s, z6.b, z3.b[0]\n" "sdot z26.s, z6.b, z4.b[0]\n" - "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #4, MUL VL]\n" "sdot z11.s, z7.b, z0.b[0]\n" "sdot z15.s, z7.b, z1.b[0]\n" "sdot z19.s, z7.b, z2.b[0]\n" "sdot z23.s, z7.b, z3.b[0]\n" "sdot z27.s, z7.b, z4.b[0]\n" - "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n" "sdot z8.s, z6.b, z0.b[1]\n" "sdot z12.s, z6.b, z1.b[1]\n" "sdot z16.s, z6.b, z2.b[1]\n" "sdot z20.s, z6.b, z3.b[1]\n" "sdot z24.s, z6.b, z4.b[1]\n" - "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n" "sdot z9.s, z7.b, z0.b[1]\n" "sdot z13.s, z7.b, z1.b[1]\n" "sdot z17.s, z7.b, z2.b[1]\n" "sdot z21.s, z7.b, z3.b[1]\n" "sdot z25.s, z7.b, z4.b[1]\n" - "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n" - "addvl x14, x14, #16\n" + "ld1b { z7.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #16\n" "sdot 
z10.s, z6.b, z0.b[1]\n" "sdot z14.s, z6.b, z1.b[1]\n" "sdot z18.s, z6.b, z2.b[1]\n" "sdot z22.s, z6.b, z3.b[1]\n" "sdot z26.s, z6.b, z4.b[1]\n" - "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n" "sdot z11.s, z7.b, z0.b[1]\n" "sdot z15.s, z7.b, z1.b[1]\n" "sdot z19.s, z7.b, z2.b[1]\n" "sdot z23.s, z7.b, z3.b[1]\n" "sdot z27.s, z7.b, z4.b[1]\n" - "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n" "sdot z8.s, z6.b, z0.b[2]\n" "sdot z12.s, z6.b, z1.b[2]\n" "sdot z16.s, z6.b, z2.b[2]\n" "sdot z20.s, z6.b, z3.b[2]\n" "sdot z24.s, z6.b, z4.b[2]\n" - "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-6, MUL VL]\n" "sdot z9.s, z7.b, z0.b[2]\n" "sdot z13.s, z7.b, z1.b[2]\n" "sdot z17.s, z7.b, z2.b[2]\n" "sdot z21.s, z7.b, z3.b[2]\n" "sdot z25.s, z7.b, z4.b[2]\n" - "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-5, MUL VL]\n" "sdot z10.s, z6.b, z0.b[2]\n" "sdot z14.s, z6.b, z1.b[2]\n" "sdot z18.s, z6.b, z2.b[2]\n" "sdot z22.s, z6.b, z3.b[2]\n" "sdot z26.s, z6.b, z4.b[2]\n" - "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-4, MUL VL]\n" "sdot z11.s, z7.b, z0.b[2]\n" "sdot z15.s, z7.b, z1.b[2]\n" "sdot z19.s, z7.b, z2.b[2]\n" "sdot z23.s, z7.b, z3.b[2]\n" "sdot z27.s, z7.b, z4.b[2]\n" - "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-3, MUL VL]\n" "sdot z8.s, z6.b, z0.b[3]\n" "sdot z12.s, z6.b, z1.b[3]\n" "sdot z16.s, z6.b, z2.b[3]\n" "sdot z20.s, z6.b, z3.b[3]\n" "sdot z24.s, z6.b, z4.b[3]\n" - "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[3]\n" "sdot z13.s, z7.b, z1.b[3]\n" "sdot z17.s, z7.b, z2.b[3]\n" "sdot z21.s, z7.b, z3.b[3]\n" "sdot z25.s, z7.b, z4.b[3]\n" - "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-1, MUL VL]\n" "sdot z10.s, z6.b, z0.b[3]\n" "sdot z14.s, z6.b, z1.b[3]\n" "sdot z18.s, 
z6.b, z2.b[3]\n" @@ -1683,35 +1626,35 @@ void sve_hybrid_s8qs_dot_6x4VL ( "sdot z19.s, z7.b, z2.b[3]\n" "sdot z23.s, z7.b, z3.b[3]\n" "sdot z27.s, z7.b, z4.b[3]\n" - "bgt 64b\n" - "65:" // Height 5: Multiply loop: Single iteration only - "ld1b { z6.b }, p2/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "bgt 59b\n" + "60:" // Height 5: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqb { z3.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "sdot z16.s, z6.b, z2.b[0]\n" - "ld1rqb { z4.b }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" - "sdot z13.s, z7.b, z1.b[0]\n" + "ld1rqb { z4.b }, p0/Z, [x21]\n" "add x22, x22, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "add x21, x21, #0x10\n" "sdot z17.s, z7.b, z2.b[0]\n" "sdot z20.s, z6.b, z3.b[0]\n" "sdot z24.s, z6.b, z4.b[0]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" "sdot z21.s, z7.b, z3.b[0]\n" "sdot z25.s, z7.b, z4.b[0]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" "sdot z10.s, z6.b, z0.b[0]\n" "sdot z14.s, z6.b, z1.b[0]\n" "sdot z18.s, z6.b, z2.b[0]\n" @@ -1722,23 +1665,23 @@ void sve_hybrid_s8qs_dot_6x4VL ( "sdot z19.s, z7.b, z2.b[0]\n" "sdot z23.s, z7.b, z3.b[0]\n" "sdot z27.s, z7.b, z4.b[0]\n" - "ble 66f\n" - "ld1b { z6.b }, p2/Z, 
[x14]\n" + "ble 61f\n" + "ld1b { z6.b }, p2/Z, [x28]\n" "sdot z8.s, z6.b, z0.b[1]\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "sdot z12.s, z6.b, z1.b[1]\n" "sdot z16.s, z6.b, z2.b[1]\n" "sdot z20.s, z6.b, z3.b[1]\n" "sdot z24.s, z6.b, z4.b[1]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[1]\n" "sdot z13.s, z7.b, z1.b[1]\n" "sdot z17.s, z7.b, z2.b[1]\n" "sdot z21.s, z7.b, z3.b[1]\n" "sdot z25.s, z7.b, z4.b[1]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" "sdot z10.s, z6.b, z0.b[1]\n" "sdot z14.s, z6.b, z1.b[1]\n" "sdot z18.s, z6.b, z2.b[1]\n" @@ -1749,23 +1692,23 @@ void sve_hybrid_s8qs_dot_6x4VL ( "sdot z19.s, z7.b, z2.b[1]\n" "sdot z23.s, z7.b, z3.b[1]\n" "sdot z27.s, z7.b, z4.b[1]\n" - "ble 66f\n" - "ld1b { z6.b }, p2/Z, [x14]\n" + "ble 61f\n" + "ld1b { z6.b }, p2/Z, [x28]\n" "sdot z8.s, z6.b, z0.b[2]\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "sdot z12.s, z6.b, z1.b[2]\n" "sdot z16.s, z6.b, z2.b[2]\n" "sdot z20.s, z6.b, z3.b[2]\n" "sdot z24.s, z6.b, z4.b[2]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[2]\n" "sdot z13.s, z7.b, z1.b[2]\n" "sdot z17.s, z7.b, z2.b[2]\n" "sdot z21.s, z7.b, z3.b[2]\n" "sdot z25.s, z7.b, z4.b[2]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" "sdot z10.s, z6.b, z0.b[2]\n" "sdot z14.s, z6.b, z1.b[2]\n" "sdot z18.s, z6.b, z2.b[2]\n" @@ -1776,22 +1719,22 @@ void sve_hybrid_s8qs_dot_6x4VL ( "sdot z19.s, z7.b, z2.b[2]\n" "sdot z23.s, z7.b, z3.b[2]\n" "sdot z27.s, z7.b, z4.b[2]\n" - "ble 66f\n" - "ld1b { z6.b }, 
p2/Z, [x14]\n" + "ble 61f\n" + "ld1b { z6.b }, p2/Z, [x28]\n" "sdot z8.s, z6.b, z0.b[3]\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" "sdot z12.s, z6.b, z1.b[3]\n" "sdot z16.s, z6.b, z2.b[3]\n" "sdot z20.s, z6.b, z3.b[3]\n" "sdot z24.s, z6.b, z4.b[3]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[3]\n" "sdot z13.s, z7.b, z1.b[3]\n" "sdot z17.s, z7.b, z2.b[3]\n" "sdot z21.s, z7.b, z3.b[3]\n" "sdot z25.s, z7.b, z4.b[3]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" "sdot z10.s, z6.b, z0.b[3]\n" "sdot z14.s, z6.b, z1.b[3]\n" "sdot z18.s, z6.b, z2.b[3]\n" @@ -1802,26 +1745,31 @@ void sve_hybrid_s8qs_dot_6x4VL ( "sdot z19.s, z7.b, z2.b[3]\n" "sdot z23.s, z7.b, z3.b[3]\n" "sdot z27.s, z7.b, z4.b[3]\n" - "66:" // Height 5: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "61:" // Height 5: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 61b\n" - "ld1w { z0.s }, p2/Z, [x16]\n" + "cmp x27, x19\n" + "bne 56b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z0.s }, p2/Z, [x11]\n" "add z8.s, z8.s, z0.s\n" - "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n" + "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n" + "add x23, x9, x19\n" "add z12.s, z12.s, z0.s\n" - "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n" + "add x22, x23, x19\n" "add z16.s, z16.s, z0.s\n" - "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n" - "addvl x16, x16, #4\n" + "ld1w { z3.s }, p2/Z, [x11, #3, MUL 
VL]\n" + "add x21, x22, x19\n" "add z9.s, z9.s, z1.s\n" + "add x20, x21, x19\n" "add z13.s, z13.s, z1.s\n" + "addvl x11, x11, #4\n" "add z10.s, z10.s, z2.s\n" "add z11.s, z11.s, z3.s\n" "add z14.s, z14.s, z2.s\n" @@ -1837,30 +1785,30 @@ void sve_hybrid_s8qs_dot_6x4VL ( "add z25.s, z25.s, z1.s\n" "add z26.s, z26.s, z2.s\n" "add z27.s, z27.s, z3.s\n" - "tbz %x[flags], #4, 67f\n" - "ld1w { z0.s }, p2/Z, [x17]\n" - "ld1w { z4.s }, p2/Z, [x8]\n" - "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n" - "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n" - "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n" - "addvl x17, x17, #4\n" - "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n" - "addvl x8, x8, #4\n" - "b 68f\n" - "67:" // Height 5: per layer parameters - "add x19, %x[qp], %[per_layer_right_shift]\n" - "ld1rw { z0.s }, p2/Z, [x19]\n" + "tbz %x[flags], #4, 62f\n" + "ld1w { z0.s }, p2/Z, [x12]\n" + "ld1w { z4.s }, p2/Z, [x13]\n" + "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n" + "addvl x12, x12, #4\n" + "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "b 63f\n" + "62:" // Height 5: per layer parameters + "add x24, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z0.s }, p2/Z, [x24]\n" "mov z1.d, z0.d\n" - "add x19, %x[qp], %[per_layer_mul]\n" - "ld1rw { z4.s }, p2/Z, [x19]\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1rw { z4.s }, p2/Z, [x24]\n" "mov z2.d, z0.d\n" "mov z3.d, z0.d\n" "mov z5.d, z4.d\n" "mov z6.d, z4.d\n" "mov z7.d, z4.d\n" - "68:" // Height 5: parameters loaded + "63:" // Height 5: parameters loaded ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" @@ -1881,7 +1829,7 @@ void sve_hybrid_s8qs_dot_6x4VL ( 
".inst 0x04a57739 // sqrdmulh z25.s, z25.s, z5.s\n" ".inst 0x04a6775a // sqrdmulh z26.s, z26.s, z6.s\n" ".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n" - "tbz %x[flags], #5, 69f\n" + "tbz %x[flags], #5, 64f\n" "and z4.d, z8.d, z0.d\n" "asr z4.s, z4.s, #0x1f\n" "and z5.d, z9.d, z1.d\n" @@ -1942,17 +1890,17 @@ void sve_hybrid_s8qs_dot_6x4VL ( "sqadd z25.s, z25.s, z5.s\n" "sqadd z26.s, z26.s, z6.s\n" "sqadd z27.s, z27.s, z7.s\n" - "69:" // Height 5: no shift correction + "64:" // Height 5: no shift correction ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" - "add x19, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x19]\n" + "add x24, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x24]\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" - "add x19, %x[qp], %[minval]\n" + "add x24, %x[qp], %[minval]\n" ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" - "ld1rw { z5.s }, p2/Z, [x19]\n" - "add x19, %x[qp], %[maxval]\n" + "ld1rw { z5.s }, p2/Z, [x24]\n" + "add x24, %x[qp], %[maxval]\n" ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" - "ld1rw { z6.s }, p2/Z, [x19]\n" + "ld1rw { z6.s }, p2/Z, [x24]\n" ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n" "add z8.s, z8.s, z4.s\n" "add z9.s, z9.s, z4.s\n" @@ -1973,9 +1921,9 @@ void sve_hybrid_s8qs_dot_6x4VL ( "uzp1 z9.h, z10.h, z11.h\n" "smax z12.s, p2/M, z12.s, z5.s\n" "uzp1 z8.b, z8.b, z9.b\n" - "st1b { z8.b }, p1, [x13]\n" + "st1b { z8.b }, p1, [x9]\n" "add z13.s, z13.s, z4.s\n" - "addvl x13, x13, #1\n" + "addvl x9, x9, #1\n" ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n" ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" @@ -1998,29 +1946,27 @@ void sve_hybrid_s8qs_dot_6x4VL ( "uzp1 z13.h, z14.h, z15.h\n" ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" "uzp1 z12.b, z12.b, z13.b\n" - "st1b { z12.b }, p1, [x9]\n" + "st1b { z12.b }, p1, [x23]\n" "add z18.s, z18.s, z4.s\n" - "addvl x9, x9, #1\n" - "add z19.s, z19.s, z4.s\n" 
"smax z17.s, p2/M, z17.s, z5.s\n" + "add z19.s, z19.s, z4.s\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" "smin z18.s, p2/M, z18.s, z6.s\n" - "smin z19.s, p2/M, z19.s, z6.s\n" "uzp1 z16.h, z16.h, z17.h\n" + "smin z19.s, p2/M, z19.s, z6.s\n" "add z20.s, z20.s, z4.s\n" "smax z18.s, p2/M, z18.s, z5.s\n" + ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n" "smax z19.s, p2/M, z19.s, z5.s\n" "smin z20.s, p2/M, z20.s, z6.s\n" - ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n" ".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n" + "add z21.s, z21.s, z4.s\n" "uzp1 z17.h, z18.h, z19.h\n" "smax z20.s, p2/M, z20.s, z5.s\n" - "add z21.s, z21.s, z4.s\n" "add z22.s, z22.s, z4.s\n" "uzp1 z16.b, z16.b, z17.b\n" - "st1b { z16.b }, p1, [x27]\n" + "st1b { z16.b }, p1, [x22]\n" "smin z21.s, p2/M, z21.s, z6.s\n" - "addvl x27, x27, #1\n" "smin z22.s, p2/M, z22.s, z6.s\n" ".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" @@ -2043,57 +1989,35 @@ void sve_hybrid_s8qs_dot_6x4VL ( ".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n" "uzp1 z24.h, z24.h, z25.h\n" "uzp1 z20.b, z20.b, z21.b\n" - "st1b { z20.b }, p1, [x25]\n" + "st1b { z20.b }, p1, [x21]\n" "add z27.s, z27.s, z4.s\n" - "addvl x25, x25, #1\n" "smin z26.s, p2/M, z26.s, z6.s\n" "smin z27.s, p2/M, z27.s, z6.s\n" "smax z26.s, p2/M, z26.s, z5.s\n" "smax z27.s, p2/M, z27.s, z5.s\n" "uzp1 z25.h, z26.h, z27.h\n" "uzp1 z24.b, z24.b, z25.b\n" - "st1b { z24.b }, p1, [x23]\n" - "addvl x23, x23, #1\n" - "70:" // Height 5: Writeback done - "decw x15, ALL, MUL #4\n" - "cmp x15, XZR\n" - "bgt 59b\n" - "b 86f\n" - "71:" // Height 6 - "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" - "mov x16, %x[col_bias]\n" - "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "st1b { z24.b }, p1, [x20]\n" + "65:" // Height 5: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp 
x10, XZR\n" + "bgt 54b\n" + "b 80f\n" + "66:" // Height 6 + "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x11, %x[col_bias]\n" + "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x9, %x[output_ptr]\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x20, #0x6\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 72f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "ldr x23, [%x[output_ptr], #0x20]\n" - "add x27, x27, x19\n" - "ldr x21, [%x[output_ptr], #0x28]\n" - "add %x[output_ptr], %x[output_ptr], #0x30\n" - "add x25, x25, x19\n" - "add x23, x23, x19\n" - "add x21, x21, x19\n" - "b 73f\n" - "72:" // Height 6: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19\n" - "add x27, x9, x19\n" - "add x25, x27, x19\n" - "add x23, x25, x19\n" - "add x21, x23, x19\n" - "add %x[output_ptr], x21, x19\n" - "73:" // Height 6: Column loop + "madd %x[output_ptr], x19, x20, %x[output_ptr]\n" + "67:" // Height 6: Column loop "mov z8.s, #0x0\n" "mov x19, #0x0\n" "mov z9.s, #0x0\n" - "whilelt p1.b, x19, x15\n" + "whilelt p1.b, x19, x10\n" "mov z10.s, #0x0\n" "mov z11.s, #0x0\n" "mov z12.s, #0x0\n" @@ -2116,77 +2040,77 @@ void sve_hybrid_s8qs_dot_6x4VL ( "mov z29.s, #0x0\n" "mov z30.s, #0x0\n" "mov z31.s, #0x0\n" - "74:" // Height 6: setup done - "mov x12, #0x0\n" - "75:" // Height 6: String loop + "68:" // Height 6: setup done + "mov x27, #0x0\n" + "69:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 76f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 70f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add 
x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "ldr x22, [x20, #0x20]\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" "ldr x20, [x20, #0x28]\n" - "cbnz x12, 77f\n" + "cbnz x27, 71f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "add x26, x26, x19\n" + "add x25, x25, x19\n" "add x24, x24, x19\n" + "add x23, x23, x19\n" "add x22, x22, x19\n" + "add x21, x21, x19\n" "add x20, x20, x19\n" - "b 77f\n" - "76:" // Height 6: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "add x26, x28, x19\n" - "add x24, x26, x19\n" - "add x22, x24, x19\n" - "add x20, x22, x19\n" - "77:" // Height 6: input setup done - "cmp x11, #0x10\n" - "ble 79f\n" - "78:" // Height 6: Multiply loop: Main loop head - "ld1b { z6.b }, p2/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" - "sub x11, x11, #0x10\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "b 71f\n" + "70:" // Height 6: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "add x20, x21, x19\n" + "71:" // Height 6: input setup done + "cmp x26, #0x10\n" + "ble 73f\n" + "72:" // Height 6: Multiply loop: Main loop head + "ld1b { z6.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "add 
x26, x26, #0x10\n" + "ld1rqb { z3.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "sdot z16.s, z6.b, z2.b[0]\n" - "ld1rqb { z4.b }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" + "ld1rqb { z4.b }, p0/Z, [x21]\n" + "add x22, x22, #0x10\n" "sdot z13.s, z7.b, z1.b[0]\n" "ld1rqb { z5.b }, p0/Z, [x20]\n" - "add x22, x22, #0x10\n" + "add x21, x21, #0x10\n" "sdot z20.s, z6.b, z3.b[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" "add x20, x20, #0x10\n" "sdot z24.s, z6.b, z4.b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "cmp x11, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x26, #0x10\n" "sdot z28.s, z6.b, z5.b[0]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" "sdot z17.s, z7.b, z2.b[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "sdot z21.s, z7.b, z3.b[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "sdot z25.s, z7.b, z4.b[0]\n" "prfm pldl1keep, [x22, #0x80]\n" + "sdot z25.s, z7.b, z4.b[0]\n" + "prfm pldl1keep, [x21, #0x80]\n" "sdot z29.s, z7.b, z5.b[0]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" "sdot z10.s, z6.b, z0.b[0]\n" "prfm pldl1keep, [x20, #0x80]\n" "sdot z14.s, z6.b, z1.b[0]\n" @@ -2194,85 +2118,85 @@ void sve_hybrid_s8qs_dot_6x4VL ( "sdot z22.s, z6.b, z3.b[0]\n" "sdot z26.s, z6.b, z4.b[0]\n" "sdot z30.s, z6.b, z5.b[0]\n" - "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #4, MUL VL]\n" "sdot z11.s, z7.b, z0.b[0]\n" "sdot z15.s, z7.b, z1.b[0]\n" "sdot z19.s, z7.b, z2.b[0]\n" "sdot z23.s, z7.b, z3.b[0]\n" "sdot z27.s, z7.b, z4.b[0]\n" "sdot z31.s, z7.b, z5.b[0]\n" - "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n" "sdot z8.s, z6.b, z0.b[1]\n" "sdot z12.s, z6.b, z1.b[1]\n" "sdot z16.s, z6.b, z2.b[1]\n" "sdot z20.s, z6.b, z3.b[1]\n" "sdot z24.s, z6.b, z4.b[1]\n" "sdot z28.s, z6.b, z5.b[1]\n" - "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n" + "ld1b { z6.b }, p2/Z, 
[x28, #6, MUL VL]\n" "sdot z9.s, z7.b, z0.b[1]\n" "sdot z13.s, z7.b, z1.b[1]\n" "sdot z17.s, z7.b, z2.b[1]\n" "sdot z21.s, z7.b, z3.b[1]\n" "sdot z25.s, z7.b, z4.b[1]\n" "sdot z29.s, z7.b, z5.b[1]\n" - "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n" - "addvl x14, x14, #16\n" + "ld1b { z7.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #16\n" "sdot z10.s, z6.b, z0.b[1]\n" "sdot z14.s, z6.b, z1.b[1]\n" "sdot z18.s, z6.b, z2.b[1]\n" "sdot z22.s, z6.b, z3.b[1]\n" "sdot z26.s, z6.b, z4.b[1]\n" "sdot z30.s, z6.b, z5.b[1]\n" - "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n" "sdot z11.s, z7.b, z0.b[1]\n" "sdot z15.s, z7.b, z1.b[1]\n" "sdot z19.s, z7.b, z2.b[1]\n" "sdot z23.s, z7.b, z3.b[1]\n" "sdot z27.s, z7.b, z4.b[1]\n" "sdot z31.s, z7.b, z5.b[1]\n" - "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n" "sdot z8.s, z6.b, z0.b[2]\n" "sdot z12.s, z6.b, z1.b[2]\n" "sdot z16.s, z6.b, z2.b[2]\n" "sdot z20.s, z6.b, z3.b[2]\n" "sdot z24.s, z6.b, z4.b[2]\n" "sdot z28.s, z6.b, z5.b[2]\n" - "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-6, MUL VL]\n" "sdot z9.s, z7.b, z0.b[2]\n" "sdot z13.s, z7.b, z1.b[2]\n" "sdot z17.s, z7.b, z2.b[2]\n" "sdot z21.s, z7.b, z3.b[2]\n" "sdot z25.s, z7.b, z4.b[2]\n" "sdot z29.s, z7.b, z5.b[2]\n" - "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-5, MUL VL]\n" "sdot z10.s, z6.b, z0.b[2]\n" "sdot z14.s, z6.b, z1.b[2]\n" "sdot z18.s, z6.b, z2.b[2]\n" "sdot z22.s, z6.b, z3.b[2]\n" "sdot z26.s, z6.b, z4.b[2]\n" "sdot z30.s, z6.b, z5.b[2]\n" - "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-4, MUL VL]\n" "sdot z11.s, z7.b, z0.b[2]\n" "sdot z15.s, z7.b, z1.b[2]\n" "sdot z19.s, z7.b, z2.b[2]\n" "sdot z23.s, z7.b, z3.b[2]\n" "sdot z27.s, z7.b, z4.b[2]\n" "sdot z31.s, z7.b, z5.b[2]\n" - "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-3, MUL VL]\n" "sdot z8.s, z6.b, 
z0.b[3]\n" "sdot z12.s, z6.b, z1.b[3]\n" "sdot z16.s, z6.b, z2.b[3]\n" "sdot z20.s, z6.b, z3.b[3]\n" "sdot z24.s, z6.b, z4.b[3]\n" "sdot z28.s, z6.b, z5.b[3]\n" - "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[3]\n" "sdot z13.s, z7.b, z1.b[3]\n" "sdot z17.s, z7.b, z2.b[3]\n" "sdot z21.s, z7.b, z3.b[3]\n" "sdot z25.s, z7.b, z4.b[3]\n" "sdot z29.s, z7.b, z5.b[3]\n" - "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-1, MUL VL]\n" "sdot z10.s, z6.b, z0.b[3]\n" "sdot z14.s, z6.b, z1.b[3]\n" "sdot z18.s, z6.b, z2.b[3]\n" @@ -2285,39 +2209,39 @@ void sve_hybrid_s8qs_dot_6x4VL ( "sdot z23.s, z7.b, z3.b[3]\n" "sdot z27.s, z7.b, z4.b[3]\n" "sdot z31.s, z7.b, z5.b[3]\n" - "bgt 78b\n" - "79:" // Height 6: Multiply loop: Single iteration only - "ld1b { z6.b }, p2/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "bgt 72b\n" + "73:" // Height 6: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqb { z3.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "sdot z16.s, z6.b, z2.b[0]\n" - "ld1rqb { z4.b }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" + "ld1rqb { z4.b }, p0/Z, [x21]\n" + "add x22, x22, #0x10\n" "sdot z13.s, z7.b, z1.b[0]\n" "ld1rqb { z5.b }, p0/Z, [x20]\n" - "add x22, x22, #0x10\n" + "add x21, x21, #0x10\n" "sdot z20.s, z6.b, z3.b[0]\n" "add 
x20, x20, #0x10\n" "sdot z17.s, z7.b, z2.b[0]\n" "sdot z24.s, z6.b, z4.b[0]\n" "sdot z28.s, z6.b, z5.b[0]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" "sdot z21.s, z7.b, z3.b[0]\n" "sdot z25.s, z7.b, z4.b[0]\n" "sdot z29.s, z7.b, z5.b[0]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" "sdot z10.s, z6.b, z0.b[0]\n" "sdot z14.s, z6.b, z1.b[0]\n" "sdot z18.s, z6.b, z2.b[0]\n" @@ -2330,25 +2254,25 @@ void sve_hybrid_s8qs_dot_6x4VL ( "sdot z23.s, z7.b, z3.b[0]\n" "sdot z27.s, z7.b, z4.b[0]\n" "sdot z31.s, z7.b, z5.b[0]\n" - "ble 80f\n" - "ld1b { z6.b }, p2/Z, [x14]\n" + "ble 74f\n" + "ld1b { z6.b }, p2/Z, [x28]\n" "sdot z8.s, z6.b, z0.b[1]\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "sdot z12.s, z6.b, z1.b[1]\n" "sdot z16.s, z6.b, z2.b[1]\n" "sdot z20.s, z6.b, z3.b[1]\n" "sdot z24.s, z6.b, z4.b[1]\n" "sdot z28.s, z6.b, z5.b[1]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[1]\n" "sdot z13.s, z7.b, z1.b[1]\n" "sdot z17.s, z7.b, z2.b[1]\n" "sdot z21.s, z7.b, z3.b[1]\n" "sdot z25.s, z7.b, z4.b[1]\n" "sdot z29.s, z7.b, z5.b[1]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" "sdot z10.s, z6.b, z0.b[1]\n" "sdot z14.s, z6.b, z1.b[1]\n" "sdot z18.s, z6.b, z2.b[1]\n" @@ -2361,25 +2285,25 @@ void sve_hybrid_s8qs_dot_6x4VL ( "sdot z23.s, z7.b, z3.b[1]\n" "sdot z27.s, z7.b, z4.b[1]\n" "sdot z31.s, z7.b, z5.b[1]\n" - "ble 80f\n" - "ld1b { z6.b }, p2/Z, [x14]\n" + "ble 74f\n" + "ld1b { z6.b }, p2/Z, [x28]\n" "sdot z8.s, z6.b, z0.b[2]\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "sdot 
z12.s, z6.b, z1.b[2]\n" "sdot z16.s, z6.b, z2.b[2]\n" "sdot z20.s, z6.b, z3.b[2]\n" "sdot z24.s, z6.b, z4.b[2]\n" "sdot z28.s, z6.b, z5.b[2]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[2]\n" "sdot z13.s, z7.b, z1.b[2]\n" "sdot z17.s, z7.b, z2.b[2]\n" "sdot z21.s, z7.b, z3.b[2]\n" "sdot z25.s, z7.b, z4.b[2]\n" "sdot z29.s, z7.b, z5.b[2]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" "sdot z10.s, z6.b, z0.b[2]\n" "sdot z14.s, z6.b, z1.b[2]\n" "sdot z18.s, z6.b, z2.b[2]\n" @@ -2392,24 +2316,24 @@ void sve_hybrid_s8qs_dot_6x4VL ( "sdot z23.s, z7.b, z3.b[2]\n" "sdot z27.s, z7.b, z4.b[2]\n" "sdot z31.s, z7.b, z5.b[2]\n" - "ble 80f\n" - "ld1b { z6.b }, p2/Z, [x14]\n" + "ble 74f\n" + "ld1b { z6.b }, p2/Z, [x28]\n" "sdot z8.s, z6.b, z0.b[3]\n" - "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" "sdot z12.s, z6.b, z1.b[3]\n" "sdot z16.s, z6.b, z2.b[3]\n" "sdot z20.s, z6.b, z3.b[3]\n" "sdot z24.s, z6.b, z4.b[3]\n" "sdot z28.s, z6.b, z5.b[3]\n" - "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[3]\n" "sdot z13.s, z7.b, z1.b[3]\n" "sdot z17.s, z7.b, z2.b[3]\n" "sdot z21.s, z7.b, z3.b[3]\n" "sdot z25.s, z7.b, z4.b[3]\n" "sdot z29.s, z7.b, z5.b[3]\n" - "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" "sdot z10.s, z6.b, z0.b[3]\n" "sdot z14.s, z6.b, z1.b[3]\n" "sdot z18.s, z6.b, z2.b[3]\n" @@ -2422,28 +2346,34 @@ void sve_hybrid_s8qs_dot_6x4VL ( "sdot z23.s, z7.b, z3.b[3]\n" "sdot z27.s, z7.b, z4.b[3]\n" "sdot z31.s, z7.b, z5.b[3]\n" - "80:" // Height 6: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "74:" // Height 6: 
Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" "prfm pldl1keep, [x20, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 75b\n" - "ld1w { z0.s }, p2/Z, [x16]\n" + "cmp x27, x19\n" + "bne 69b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z0.s }, p2/Z, [x11]\n" "add z8.s, z8.s, z0.s\n" - "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n" + "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n" + "add x23, x9, x19\n" "add z12.s, z12.s, z0.s\n" - "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n" + "add x22, x23, x19\n" "add z16.s, z16.s, z0.s\n" - "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n" - "addvl x16, x16, #4\n" + "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n" + "add x21, x22, x19\n" "add z9.s, z9.s, z1.s\n" + "add x20, x21, x19\n" "add z13.s, z13.s, z1.s\n" + "add x19, x20, x19\n" "add z10.s, z10.s, z2.s\n" + "addvl x11, x11, #4\n" "add z11.s, z11.s, z3.s\n" "add z14.s, z14.s, z2.s\n" "add z15.s, z15.s, z3.s\n" @@ -2462,30 +2392,30 @@ void sve_hybrid_s8qs_dot_6x4VL ( "add z29.s, z29.s, z1.s\n" "add z30.s, z30.s, z2.s\n" "add z31.s, z31.s, z3.s\n" - "tbz %x[flags], #4, 81f\n" - "ld1w { z0.s }, p2/Z, [x17]\n" - "ld1w { z4.s }, p2/Z, [x8]\n" - "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n" - "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n" - "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n" - "addvl x17, x17, #4\n" - "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n" - "addvl x8, x8, #4\n" - "b 82f\n" - "81:" // Height 6: per layer parameters - "add x19, %x[qp], %[per_layer_right_shift]\n" - "ld1rw { z0.s }, p2/Z, [x19]\n" + "tbz %x[flags], #4, 75f\n" + "ld1w { z0.s }, p2/Z, [x12]\n" + "ld1w { z4.s }, p2/Z, [x13]\n" + "ld1w { z1.s }, p2/Z, [x12, #1, MUL 
VL]\n" + "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n" + "addvl x12, x12, #4\n" + "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "b 76f\n" + "75:" // Height 6: per layer parameters + "add x24, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z0.s }, p2/Z, [x24]\n" "mov z1.d, z0.d\n" - "add x19, %x[qp], %[per_layer_mul]\n" - "ld1rw { z4.s }, p2/Z, [x19]\n" + "add x24, %x[qp], %[per_layer_mul]\n" + "ld1rw { z4.s }, p2/Z, [x24]\n" "mov z2.d, z0.d\n" "mov z3.d, z0.d\n" "mov z5.d, z4.d\n" "mov z6.d, z4.d\n" "mov z7.d, z4.d\n" - "82:" // Height 6: parameters loaded + "76:" // Height 6: parameters loaded ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" @@ -2510,7 +2440,7 @@ void sve_hybrid_s8qs_dot_6x4VL ( ".inst 0x04a577bd // sqrdmulh z29.s, z29.s, z5.s\n" ".inst 0x04a677de // sqrdmulh z30.s, z30.s, z6.s\n" ".inst 0x04a777ff // sqrdmulh z31.s, z31.s, z7.s\n" - "tbz %x[flags], #5, 83f\n" + "tbz %x[flags], #5, 77f\n" "and z4.d, z8.d, z0.d\n" "asr z4.s, z4.s, #0x1f\n" "and z5.d, z9.d, z1.d\n" @@ -2583,17 +2513,17 @@ void sve_hybrid_s8qs_dot_6x4VL ( "sqadd z29.s, z29.s, z5.s\n" "sqadd z30.s, z30.s, z6.s\n" "sqadd z31.s, z31.s, z7.s\n" - "83:" // Height 6: no shift correction + "77:" // Height 6: no shift correction ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" - "add x19, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x19]\n" + "add x24, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x24]\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" - "add x19, %x[qp], %[minval]\n" + "add x24, %x[qp], %[minval]\n" ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" - "ld1rw { z5.s }, p2/Z, [x19]\n" - "add x19, %x[qp], %[maxval]\n" + "ld1rw { z5.s }, p2/Z, [x24]\n" + "add x24, %x[qp], %[maxval]\n" ".inst 0x4482886b 
// srshl z11.s, p2/M, z11.s, z3.s\n" - "ld1rw { z6.s }, p2/Z, [x19]\n" + "ld1rw { z6.s }, p2/Z, [x24]\n" ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n" "add z8.s, z8.s, z4.s\n" "add z9.s, z9.s, z4.s\n" @@ -2614,9 +2544,9 @@ void sve_hybrid_s8qs_dot_6x4VL ( "uzp1 z9.h, z10.h, z11.h\n" "smax z12.s, p2/M, z12.s, z5.s\n" "uzp1 z8.b, z8.b, z9.b\n" - "st1b { z8.b }, p1, [x13]\n" + "st1b { z8.b }, p1, [x9]\n" "add z13.s, z13.s, z4.s\n" - "addvl x13, x13, #1\n" + "addvl x9, x9, #1\n" ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n" ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" @@ -2639,29 +2569,27 @@ void sve_hybrid_s8qs_dot_6x4VL ( "uzp1 z13.h, z14.h, z15.h\n" ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" "uzp1 z12.b, z12.b, z13.b\n" - "st1b { z12.b }, p1, [x9]\n" + "st1b { z12.b }, p1, [x23]\n" "add z18.s, z18.s, z4.s\n" - "addvl x9, x9, #1\n" - "add z19.s, z19.s, z4.s\n" "smax z17.s, p2/M, z17.s, z5.s\n" + "add z19.s, z19.s, z4.s\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" "smin z18.s, p2/M, z18.s, z6.s\n" - "smin z19.s, p2/M, z19.s, z6.s\n" "uzp1 z16.h, z16.h, z17.h\n" + "smin z19.s, p2/M, z19.s, z6.s\n" "add z20.s, z20.s, z4.s\n" "smax z18.s, p2/M, z18.s, z5.s\n" + ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n" "smax z19.s, p2/M, z19.s, z5.s\n" "smin z20.s, p2/M, z20.s, z6.s\n" - ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n" ".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n" + "add z21.s, z21.s, z4.s\n" "uzp1 z17.h, z18.h, z19.h\n" "smax z20.s, p2/M, z20.s, z5.s\n" - "add z21.s, z21.s, z4.s\n" "add z22.s, z22.s, z4.s\n" "uzp1 z16.b, z16.b, z17.b\n" - "st1b { z16.b }, p1, [x27]\n" + "st1b { z16.b }, p1, [x22]\n" "smin z21.s, p2/M, z21.s, z6.s\n" - "addvl x27, x27, #1\n" "smin z22.s, p2/M, z22.s, z6.s\n" ".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" @@ -2684,58 +2612,55 @@ void 
sve_hybrid_s8qs_dot_6x4VL ( ".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n" "uzp1 z24.h, z24.h, z25.h\n" "uzp1 z20.b, z20.b, z21.b\n" - "st1b { z20.b }, p1, [x25]\n" + "st1b { z20.b }, p1, [x21]\n" "add z27.s, z27.s, z4.s\n" - "addvl x25, x25, #1\n" "smin z26.s, p2/M, z26.s, z6.s\n" ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n" ".inst 0x4482883d // srshl z29.s, p2/M, z29.s, z1.s\n" "smin z27.s, p2/M, z27.s, z6.s\n" - ".inst 0x4482885e // srshl z30.s, p2/M, z30.s, z2.s\n" + "smax z26.s, p2/M, z26.s, z5.s\n" "add z28.s, z28.s, z4.s\n" "add z29.s, z29.s, z4.s\n" - "smax z26.s, p2/M, z26.s, z5.s\n" - "add z30.s, z30.s, z4.s\n" "smax z27.s, p2/M, z27.s, z5.s\n" "smin z28.s, p2/M, z28.s, z6.s\n" "smin z29.s, p2/M, z29.s, z6.s\n" - "smin z30.s, p2/M, z30.s, z6.s\n" + ".inst 0x4482885e // srshl z30.s, p2/M, z30.s, z2.s\n" "uzp1 z25.h, z26.h, z27.h\n" "smax z28.s, p2/M, z28.s, z5.s\n" "uzp1 z24.b, z24.b, z25.b\n" - "st1b { z24.b }, p1, [x23]\n" + "st1b { z24.b }, p1, [x20]\n" + "add z30.s, z30.s, z4.s\n" "smax z29.s, p2/M, z29.s, z5.s\n" - "addvl x23, x23, #1\n" - "smax z30.s, p2/M, z30.s, z5.s\n" ".inst 0x4482887f // srshl z31.s, p2/M, z31.s, z3.s\n" + "smin z30.s, p2/M, z30.s, z6.s\n" "uzp1 z28.h, z28.h, z29.h\n" "add z31.s, z31.s, z4.s\n" + "smax z30.s, p2/M, z30.s, z5.s\n" "smin z31.s, p2/M, z31.s, z6.s\n" "smax z31.s, p2/M, z31.s, z5.s\n" "uzp1 z29.h, z30.h, z31.h\n" "uzp1 z28.b, z28.b, z29.b\n" - "st1b { z28.b }, p1, [x21]\n" - "addvl x21, x21, #1\n" - "84:" // Height 6: Writeback done - "decw x15, ALL, MUL #4\n" - "cmp x15, XZR\n" - "bgt 73b\n" + "st1b { z28.b }, p1, [x19]\n" + "78:" // Height 6: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 67b\n" "subs %x[M], %x[M], #0x6\n" - "beq 86f\n" + "beq 80f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 85f\n" + "tbz %x[flags], #3, 79f\n" "add x20, x20, #0x6\n" "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "85:" // Update direct input 
+ "79:" // Update direct input "mov x19, #0x6\n" "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" "b 1b\n" - "86:" // Exit + "80:" // Exit : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) - : "cc", "memory", "p0", "p1", "p2", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp 
b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp index a6652fd1b2..b2c376196f 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp @@ -36,7 +36,6 @@ namespace arm_gemm { - // Actual kernel implementations void sve_hybrid_s8s32_dot_6x4VL( ARGLIST ); @@ -73,7 +72,6 @@ public: // Default to the generic kernel kern_type kernel=sve_hybrid_s8s32_dot_6x4VL; - cls_sve_hybrid_s8s32_dot_6x4VL(const CPUInfo *) { } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp index b2ebce80d2..8862b3665a 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp @@ -80,197 +80,184 @@ void sve_hybrid_s8s32_dot_6x4VL ( "ptrue p5.b\n" "1:" // Row loop "cmp %x[M], #0x6\n" - "bge 61f\n" + "bge 56f\n" "cmp %x[M], #0x4\n" - "bgt 49f\n" - "beq 37f\n" + "bgt 45f\n" + "beq 34f\n" "cmp %x[M], #0x2\n" - "bgt 25f\n" - "beq 13f\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 2f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "b 3f\n" - "2:" // Height 1: setup direct output - "mov x13, %x[output_ptr]\n" - "3:" // Height 1: Column loop + "bgt 23f\n" + "beq 12f\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "2:" // Height 1: Column loop "mov x19, #0x0\n" - "whilelt p4.s, x19, x15\n" + "whilelt p4.s, x19, x10\n" "incw x19\n" - "whilelt p3.s, x19, x15\n" + "whilelt p3.s, x19, x10\n" "incw x19\n" - "whilelt p2.s, x19, x15\n" + "whilelt p2.s, x19, x10\n" "incw x19\n" - "whilelt p1.s, x19, x15\n" - "tbz %x[flags], #0, 
4f\n" - "ld1w { z8.s }, p4/Z, [x13]\n" - "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" - "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" - "b 5f\n" - "4:" // Height 1: no accumulate + "whilelt p1.s, x19, x10\n" + "tbz %x[flags], #0, 3f\n" + "ld1w { z8.s }, p4/Z, [x28]\n" + "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n" + "b 4f\n" + "3:" // Height 1: no accumulate "mov z8.s, #0x0\n" "mov z9.s, #0x0\n" "mov z10.s, #0x0\n" "mov z11.s, #0x0\n" - "5:" // Height 1: setup done - "mov x12, #0x0\n" - "6:" // Height 1: String loop + "4:" // Height 1: setup done + "mov x27, #0x0\n" + "5:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 7f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 6f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "cbnz x12, 8f\n" + "ldr x25, [x20, #0x0]\n" + "cbnz x27, 7f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "b 8f\n" - "7:" // Height 1: setup direct input - "mov x10, %x[input_ptr]\n" - "8:" // Height 1: input setup done - "cmp x11, #0x10\n" - "ble 10f\n" - "9:" // Height 1: Multiply loop: Main loop head - "ld1b { z6.b }, p5/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "sub x11, x11, #0x10\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "add x25, x25, x19\n" + "b 7f\n" + "6:" // Height 1: setup direct input + "mov x25, %x[input_ptr]\n" + "7:" // Height 1: input setup done + "cmp x26, #0x10\n" + "ble 9f\n" + "8:" // Height 1: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "sub x26, x26, #0x10\n" + 
"ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" - "add x10, x10, #0x10\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" + "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "cmp x11, #0x10\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "cmp x26, #0x10\n" "sdot z10.s, z6.b, z0.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" - "prfm pldl1keep, [x10, #0x80]\n" + "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n" + "prfm pldl1keep, [x25, #0x80]\n" "sdot z11.s, z7.b, z0.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n" "sdot z8.s, z6.b, z0.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n" "sdot z9.s, z7.b, z0.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" - "addvl x14, x14, #16\n" + "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" "sdot z10.s, z6.b, z0.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n" "sdot z11.s, z7.b, z0.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n" "sdot z8.s, z6.b, z0.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n" "sdot z9.s, z7.b, z0.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n" "sdot z10.s, z6.b, z0.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n" "sdot z11.s, z7.b, z0.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n" "sdot z8.s, z6.b, z0.b[3]\n" - "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n" "sdot z10.s, z6.b, z0.b[3]\n" "sdot z11.s, z7.b, z0.b[3]\n" - "bgt 9b\n" - 
"10:" // Height 1: Multiply loop: Single iteration only - "ld1b { z6.b }, p5/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "bgt 8b\n" + "9:" // Height 1: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" - "add x10, x10, #0x10\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" + "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "sdot z10.s, z6.b, z0.b[0]\n" "sdot z11.s, z7.b, z0.b[0]\n" - "ble 11f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 10f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "sdot z8.s, z6.b, z0.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "sdot z9.s, z7.b, z0.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" "sdot z10.s, z6.b, z0.b[1]\n" - "addvl x14, x14, #4\n" + "addvl x9, x9, #4\n" "sdot z11.s, z7.b, z0.b[1]\n" - "ble 11f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 10f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "sdot z8.s, z6.b, z0.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "sdot z9.s, z7.b, z0.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" "sdot z10.s, z6.b, z0.b[2]\n" - "addvl x14, x14, #4\n" + "addvl x9, x9, #4\n" "sdot z11.s, 
z7.b, z0.b[2]\n" - "ble 11f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 10f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "sdot z8.s, z6.b, z0.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "sdot z10.s, z6.b, z0.b[3]\n" "sdot z11.s, z7.b, z0.b[3]\n" - "11:" // Height 1: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" + "10:" // Height 1: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 6b\n" - "st1w { z8.s }, p4, [x13]\n" - "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" - "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" - "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "12:" // Height 1: Writeback done - "decw x15, ALL, MUL #4\n" - "cmp x15, XZR\n" - "bgt 3b\n" - "b 74f\n" - "13:" // Height 2 - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 14f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "add x9, x9, x19, LSL #2\n" - "b 15f\n" - "14:" // Height 2: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "15:" // Height 2: Column loop + "cmp x27, x19\n" + "bne 5b\n" + "st1w { z8.s }, p4, [x28]\n" + "st1w { z9.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "11:" // Height 1: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 2b\n" + "b 68f\n" + "12:" // Height 2 + "ldr x10, 
[%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "13:" // Height 2: Column loop "mov x19, #0x0\n" - "whilelt p4.s, x19, x15\n" + "whilelt p4.s, x19, x10\n" "incw x19\n" - "whilelt p3.s, x19, x15\n" + "whilelt p3.s, x19, x10\n" "incw x19\n" - "whilelt p2.s, x19, x15\n" + "whilelt p2.s, x19, x10\n" "incw x19\n" - "whilelt p1.s, x19, x15\n" - "tbz %x[flags], #0, 16f\n" - "ld1w { z8.s }, p4/Z, [x13]\n" - "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" - "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x9]\n" - "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" - "b 17f\n" - "16:" // Height 2: no accumulate + "whilelt p1.s, x19, x10\n" + "tbz %x[flags], #0, 14f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z8.s }, p4/Z, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "b 15f\n" + "14:" // Height 2: no accumulate "mov z8.s, #0x0\n" "mov z9.s, #0x0\n" "mov z10.s, #0x0\n" @@ -279,214 +266,206 @@ void sve_hybrid_s8s32_dot_6x4VL ( "mov z13.s, #0x0\n" "mov z14.s, #0x0\n" "mov z15.s, #0x0\n" - "17:" // Height 2: setup done - "mov x12, #0x0\n" - "18:" // Height 2: String loop + "15:" // Height 2: setup done + "mov x27, #0x0\n" + "16:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 19f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 17f\n" + "ldr x20, 
[%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "cbnz x12, 20f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x27, 18f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "b 20f\n" - "19:" // Height 2: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "20:" // Height 2: input setup done - "cmp x11, #0x10\n" - "ble 22f\n" - "21:" // Height 2: Multiply loop: Main loop head - "ld1b { z6.b }, p5/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "sub x11, x11, #0x10\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "b 18f\n" + "17:" // Height 2: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "18:" // Height 2: input setup done + "cmp x26, #0x10\n" + "ble 20f\n" + "19:" // Height 2: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" - "cmp x11, #0x10\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" + "cmp x26, #0x10\n" "sdot z13.s, z7.b, z1.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "prfm pldl1keep, [x28, #0x80]\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "prfm pldl1keep, [x24, #0x80]\n" "sdot z10.s, z6.b, z0.b[0]\n" "sdot z14.s, z6.b, z1.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n" "sdot z11.s, z7.b, z0.b[0]\n" "sdot 
z15.s, z7.b, z1.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n" "sdot z8.s, z6.b, z0.b[1]\n" "sdot z12.s, z6.b, z1.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n" "sdot z9.s, z7.b, z0.b[1]\n" "sdot z13.s, z7.b, z1.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" - "addvl x14, x14, #16\n" + "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" "sdot z10.s, z6.b, z0.b[1]\n" "sdot z14.s, z6.b, z1.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n" "sdot z11.s, z7.b, z0.b[1]\n" "sdot z15.s, z7.b, z1.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n" "sdot z8.s, z6.b, z0.b[2]\n" "sdot z12.s, z6.b, z1.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n" "sdot z9.s, z7.b, z0.b[2]\n" "sdot z13.s, z7.b, z1.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n" "sdot z10.s, z6.b, z0.b[2]\n" "sdot z14.s, z6.b, z1.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n" "sdot z11.s, z7.b, z0.b[2]\n" "sdot z15.s, z7.b, z1.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n" "sdot z8.s, z6.b, z0.b[3]\n" "sdot z12.s, z6.b, z1.b[3]\n" - "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[3]\n" "sdot z13.s, z7.b, z1.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n" "sdot z10.s, z6.b, z0.b[3]\n" "sdot z14.s, z6.b, z1.b[3]\n" "sdot z11.s, z7.b, z0.b[3]\n" "sdot z15.s, z7.b, z1.b[3]\n" - "bgt 21b\n" - "22:" // Height 2: Multiply loop: Single iteration only - "ld1b { z6.b }, p5/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" - "ld1rqb { 
z0.b }, p0/Z, [x10]\n" + "bgt 19b\n" + "20:" // Height 2: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" - "add x28, x28, #0x10\n" + "add x24, x24, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "sdot z13.s, z7.b, z1.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "sdot z10.s, z6.b, z0.b[0]\n" "sdot z14.s, z6.b, z1.b[0]\n" "sdot z11.s, z7.b, z0.b[0]\n" "sdot z15.s, z7.b, z1.b[0]\n" - "ble 23f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 21f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "sdot z8.s, z6.b, z0.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "sdot z12.s, z6.b, z1.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[1]\n" "sdot z13.s, z7.b, z1.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "sdot z10.s, z6.b, z0.b[1]\n" "sdot z14.s, z6.b, z1.b[1]\n" "sdot z11.s, z7.b, z0.b[1]\n" "sdot z15.s, z7.b, z1.b[1]\n" - "ble 23f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 21f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "sdot z8.s, z6.b, z0.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "sdot z12.s, z6.b, z1.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "sdot z9.s, z7.b, 
z0.b[2]\n" "sdot z13.s, z7.b, z1.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "sdot z10.s, z6.b, z0.b[2]\n" "sdot z14.s, z6.b, z1.b[2]\n" "sdot z11.s, z7.b, z0.b[2]\n" "sdot z15.s, z7.b, z1.b[2]\n" - "ble 23f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 21f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "sdot z8.s, z6.b, z0.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" "sdot z12.s, z6.b, z1.b[3]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[3]\n" "sdot z13.s, z7.b, z1.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "sdot z10.s, z6.b, z0.b[3]\n" "sdot z14.s, z6.b, z1.b[3]\n" "sdot z11.s, z7.b, z0.b[3]\n" "sdot z15.s, z7.b, z1.b[3]\n" - "23:" // Height 2: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" + "21:" // Height 2: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" + "prfm pldl1keep, [x24, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 18b\n" - "st1w { z8.s }, p4, [x13]\n" - "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" - "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" - "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "st1w { z12.s }, p4, [x9]\n" - "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" - "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" - "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" - "addvl x9, x9, #4\n" - "24:" // Height 2: Writeback done - "decw x15, ALL, MUL #4\n" - "cmp x15, XZR\n" - "bgt 15b\n" - "b 74f\n" - "25:" // Height 3 - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "cmp x27, x19\n" + "bne 16b\n" "ldr x19, [%x[args_ptr], 
%[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 26f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "add x27, x27, x19, LSL #2\n" - "b 27f\n" - "26:" // Height 3: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "27:" // Height 3: Column loop + "st1w { z8.s }, p4, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "st1w { z9.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p4, [x23]\n" + "st1w { z13.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x23, #3, MUL VL]\n" + "22:" // Height 2: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 13b\n" + "b 68f\n" + "23:" // Height 3 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "24:" // Height 3: Column loop "mov x19, #0x0\n" - "whilelt p4.s, x19, x15\n" + "whilelt p4.s, x19, x10\n" "incw x19\n" - "whilelt p3.s, x19, x15\n" + "whilelt p3.s, x19, x10\n" "incw x19\n" - "whilelt p2.s, x19, x15\n" + "whilelt p2.s, x19, x10\n" "incw x19\n" - "whilelt p1.s, x19, x15\n" - "tbz %x[flags], #0, 28f\n" - "ld1w { z8.s }, p4/Z, [x13]\n" - "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" - "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x9]\n" - "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x27]\n" - "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" - "b 29f\n" - "28:" // Height 3: no accumulate + "whilelt p1.s, x19, x10\n" + "tbz 
%x[flags], #0, 25f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z8.s }, p4/Z, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x22]\n" + "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n" + "b 26f\n" + "25:" // Height 3: no accumulate "mov z8.s, #0x0\n" "mov z9.s, #0x0\n" "mov z10.s, #0x0\n" @@ -499,267 +478,257 @@ void sve_hybrid_s8s32_dot_6x4VL ( "mov z17.s, #0x0\n" "mov z18.s, #0x0\n" "mov z19.s, #0x0\n" - "29:" // Height 3: setup done - "mov x12, #0x0\n" - "30:" // Height 3: String loop + "26:" // Height 3: setup done + "mov x27, #0x0\n" + "27:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 31f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 28f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "cbnz x12, 32f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "cbnz x27, 29f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "add x26, x26, x19\n" - "b 32f\n" - "31:" // Height 3: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "add x26, x28, x19\n" - "32:" // Height 3: input setup done - "cmp x11, #0x10\n" - "ble 34f\n" - "33:" // Height 3: Multiply loop: Main loop head 
- "ld1b { z6.b }, p5/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "sub x11, x11, #0x10\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "b 29f\n" + "28:" // Height 3: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "29:" // Height 3: input setup done + "cmp x26, #0x10\n" + "ble 31f\n" + "30:" // Height 3: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x23, x23, #0x10\n" "sdot z16.s, z6.b, z2.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" - "cmp x11, #0x10\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" + "cmp x26, #0x10\n" "sdot z13.s, z7.b, z1.b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" "sdot z17.s, z7.b, z2.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "prfm pldl1keep, [x23, #0x80]\n" "sdot z10.s, z6.b, z0.b[0]\n" "sdot z14.s, z6.b, z1.b[0]\n" "sdot z18.s, z6.b, z2.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n" "sdot z11.s, z7.b, z0.b[0]\n" "sdot z15.s, z7.b, z1.b[0]\n" "sdot z19.s, z7.b, z2.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n" "sdot z8.s, z6.b, z0.b[1]\n" "sdot z12.s, z6.b, z1.b[1]\n" "sdot z16.s, z6.b, 
z2.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n" "sdot z9.s, z7.b, z0.b[1]\n" "sdot z13.s, z7.b, z1.b[1]\n" "sdot z17.s, z7.b, z2.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" - "addvl x14, x14, #16\n" + "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" "sdot z10.s, z6.b, z0.b[1]\n" "sdot z14.s, z6.b, z1.b[1]\n" "sdot z18.s, z6.b, z2.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n" "sdot z11.s, z7.b, z0.b[1]\n" "sdot z15.s, z7.b, z1.b[1]\n" "sdot z19.s, z7.b, z2.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n" "sdot z8.s, z6.b, z0.b[2]\n" "sdot z12.s, z6.b, z1.b[2]\n" "sdot z16.s, z6.b, z2.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n" "sdot z9.s, z7.b, z0.b[2]\n" "sdot z13.s, z7.b, z1.b[2]\n" "sdot z17.s, z7.b, z2.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n" "sdot z10.s, z6.b, z0.b[2]\n" "sdot z14.s, z6.b, z1.b[2]\n" "sdot z18.s, z6.b, z2.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n" "sdot z11.s, z7.b, z0.b[2]\n" "sdot z15.s, z7.b, z1.b[2]\n" "sdot z19.s, z7.b, z2.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n" "sdot z8.s, z6.b, z0.b[3]\n" "sdot z12.s, z6.b, z1.b[3]\n" "sdot z16.s, z6.b, z2.b[3]\n" - "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[3]\n" "sdot z13.s, z7.b, z1.b[3]\n" "sdot z17.s, z7.b, z2.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n" "sdot z10.s, z6.b, z0.b[3]\n" "sdot z14.s, z6.b, z1.b[3]\n" "sdot z18.s, z6.b, z2.b[3]\n" "sdot z11.s, z7.b, z0.b[3]\n" "sdot z15.s, z7.b, z1.b[3]\n" "sdot z19.s, z7.b, z2.b[3]\n" - "bgt 33b\n" - "34:" // Height 3: Multiply 
loop: Single iteration only - "ld1b { z6.b }, p5/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "bgt 30b\n" + "31:" // Height 3: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" - "add x26, x26, #0x10\n" + "add x23, x23, #0x10\n" "sdot z13.s, z7.b, z1.b[0]\n" "sdot z16.s, z6.b, z2.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "sdot z17.s, z7.b, z2.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "sdot z10.s, z6.b, z0.b[0]\n" "sdot z14.s, z6.b, z1.b[0]\n" "sdot z18.s, z6.b, z2.b[0]\n" "sdot z11.s, z7.b, z0.b[0]\n" "sdot z15.s, z7.b, z1.b[0]\n" "sdot z19.s, z7.b, z2.b[0]\n" - "ble 35f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 32f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "sdot z8.s, z6.b, z0.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "sdot z12.s, z6.b, z1.b[1]\n" "sdot z16.s, z6.b, z2.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[1]\n" "sdot z13.s, z7.b, z1.b[1]\n" "sdot z17.s, z7.b, z2.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "sdot z10.s, z6.b, z0.b[1]\n" "sdot z14.s, 
z6.b, z1.b[1]\n" "sdot z18.s, z6.b, z2.b[1]\n" "sdot z11.s, z7.b, z0.b[1]\n" "sdot z15.s, z7.b, z1.b[1]\n" "sdot z19.s, z7.b, z2.b[1]\n" - "ble 35f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 32f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "sdot z8.s, z6.b, z0.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "sdot z12.s, z6.b, z1.b[2]\n" "sdot z16.s, z6.b, z2.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[2]\n" "sdot z13.s, z7.b, z1.b[2]\n" "sdot z17.s, z7.b, z2.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "sdot z10.s, z6.b, z0.b[2]\n" "sdot z14.s, z6.b, z1.b[2]\n" "sdot z18.s, z6.b, z2.b[2]\n" "sdot z11.s, z7.b, z0.b[2]\n" "sdot z15.s, z7.b, z1.b[2]\n" "sdot z19.s, z7.b, z2.b[2]\n" - "ble 35f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 32f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "sdot z8.s, z6.b, z0.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" "sdot z12.s, z6.b, z1.b[3]\n" "sdot z16.s, z6.b, z2.b[3]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[3]\n" "sdot z13.s, z7.b, z1.b[3]\n" "sdot z17.s, z7.b, z2.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "sdot z10.s, z6.b, z0.b[3]\n" "sdot z14.s, z6.b, z1.b[3]\n" "sdot z18.s, z6.b, z2.b[3]\n" "sdot z11.s, z7.b, z0.b[3]\n" "sdot z15.s, z7.b, z1.b[3]\n" "sdot z19.s, z7.b, z2.b[3]\n" - "35:" // Height 3: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "32:" // Height 3: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, 
x27, #0x1\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 30b\n" - "st1w { z8.s }, p4, [x13]\n" - "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" - "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" - "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "st1w { z12.s }, p4, [x9]\n" - "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" - "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" - "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" - "addvl x9, x9, #4\n" - "st1w { z16.s }, p4, [x27]\n" - "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" - "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" - "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" - "addvl x27, x27, #4\n" - "36:" // Height 3: Writeback done - "decw x15, ALL, MUL #4\n" - "cmp x15, XZR\n" - "bgt 27b\n" - "b 74f\n" - "37:" // Height 4 - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "cmp x27, x19\n" + "bne 27b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 38f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "add x27, x27, x19, LSL #2\n" - "add x25, x25, x19, LSL #2\n" - "b 39f\n" - "38:" // Height 4: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "39:" // Height 4: Column loop + "st1w { z8.s }, p4, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "st1w { z9.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x28, #2, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "st1w { z11.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p4, [x23]\n" + "st1w { z13.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x23, #3, MUL VL]\n" + "st1w { z16.s }, p4, 
[x22]\n" + "st1w { z17.s }, p3, [x22, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x22, #3, MUL VL]\n" + "33:" // Height 3: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 24b\n" + "b 68f\n" + "34:" // Height 4 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "35:" // Height 4: Column loop "mov x19, #0x0\n" - "whilelt p4.s, x19, x15\n" + "whilelt p4.s, x19, x10\n" "incw x19\n" - "whilelt p3.s, x19, x15\n" + "whilelt p3.s, x19, x10\n" "incw x19\n" - "whilelt p2.s, x19, x15\n" + "whilelt p2.s, x19, x10\n" "incw x19\n" - "whilelt p1.s, x19, x15\n" - "tbz %x[flags], #0, 40f\n" - "ld1w { z8.s }, p4/Z, [x13]\n" - "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" - "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x9]\n" - "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x27]\n" - "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x25]\n" - "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" - "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" - "b 41f\n" - "40:" // Height 4: no accumulate + "whilelt p1.s, x19, x10\n" + "tbz %x[flags], #0, 36f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z8.s }, p4/Z, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n" + "add x21, x22, x19, LSL #2\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + 
"ld1w { z16.s }, p4/Z, [x22]\n" + "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x21]\n" + "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n" + "b 37f\n" + "36:" // Height 4: no accumulate "mov z8.s, #0x0\n" "mov z9.s, #0x0\n" "mov z10.s, #0x0\n" @@ -776,123 +745,123 @@ void sve_hybrid_s8s32_dot_6x4VL ( "mov z21.s, #0x0\n" "mov z22.s, #0x0\n" "mov z23.s, #0x0\n" - "41:" // Height 4: setup done - "mov x12, #0x0\n" - "42:" // Height 4: String loop + "37:" // Height 4: setup done + "mov x27, #0x0\n" + "38:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 43f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 39f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "cbnz x12, 44f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "cbnz x27, 40f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "add x26, x26, x19\n" + "add x25, x25, x19\n" "add x24, x24, x19\n" - "b 44f\n" - "43:" // Height 4: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "add x26, x28, x19\n" - "add x24, x26, x19\n" - "44:" // Height 4: input setup done - "cmp x11, #0x10\n" - "ble 46f\n" - "45:" // Height 4: Multiply loop: Main loop head - "ld1b { z6.b }, p5/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "sub x11, x11, #0x10\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "add x23, x23, 
x19\n" + "add x22, x22, x19\n" + "b 40f\n" + "39:" // Height 4: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "40:" // Height 4: input setup done + "cmp x26, #0x10\n" + "ble 42f\n" + "41:" // Height 4: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqb { z3.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "sdot z16.s, z6.b, z2.b[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x24, x24, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x22, x22, #0x10\n" "sdot z13.s, z7.b, z1.b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "cmp x11, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x26, #0x10\n" "sdot z20.s, z6.b, z3.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "sdot z17.s, z7.b, z2.b[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "sdot z21.s, z7.b, z3.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "prfm pldl1keep, [x24, #0x80]\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "prfm pldl1keep, [x22, #0x80]\n" "sdot z10.s, z6.b, z0.b[0]\n" "sdot z14.s, z6.b, z1.b[0]\n" "sdot z18.s, z6.b, z2.b[0]\n" "sdot z22.s, z6.b, z3.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n" "sdot z11.s, z7.b, z0.b[0]\n" "sdot z15.s, z7.b, z1.b[0]\n" "sdot z19.s, z7.b, z2.b[0]\n" "sdot z23.s, z7.b, z3.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #5, MUL 
VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n" "sdot z8.s, z6.b, z0.b[1]\n" "sdot z12.s, z6.b, z1.b[1]\n" "sdot z16.s, z6.b, z2.b[1]\n" "sdot z20.s, z6.b, z3.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n" "sdot z9.s, z7.b, z0.b[1]\n" "sdot z13.s, z7.b, z1.b[1]\n" "sdot z17.s, z7.b, z2.b[1]\n" "sdot z21.s, z7.b, z3.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" - "addvl x14, x14, #16\n" + "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" "sdot z10.s, z6.b, z0.b[1]\n" "sdot z14.s, z6.b, z1.b[1]\n" "sdot z18.s, z6.b, z2.b[1]\n" "sdot z22.s, z6.b, z3.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n" "sdot z11.s, z7.b, z0.b[1]\n" "sdot z15.s, z7.b, z1.b[1]\n" "sdot z19.s, z7.b, z2.b[1]\n" "sdot z23.s, z7.b, z3.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n" "sdot z8.s, z6.b, z0.b[2]\n" "sdot z12.s, z6.b, z1.b[2]\n" "sdot z16.s, z6.b, z2.b[2]\n" "sdot z20.s, z6.b, z3.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n" "sdot z9.s, z7.b, z0.b[2]\n" "sdot z13.s, z7.b, z1.b[2]\n" "sdot z17.s, z7.b, z2.b[2]\n" "sdot z21.s, z7.b, z3.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n" "sdot z10.s, z6.b, z0.b[2]\n" "sdot z14.s, z6.b, z1.b[2]\n" "sdot z18.s, z6.b, z2.b[2]\n" "sdot z22.s, z6.b, z3.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n" "sdot z11.s, z7.b, z0.b[2]\n" "sdot z15.s, z7.b, z1.b[2]\n" "sdot z19.s, z7.b, z2.b[2]\n" "sdot z23.s, z7.b, z3.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n" "sdot z8.s, z6.b, z0.b[3]\n" "sdot z12.s, z6.b, z1.b[3]\n" "sdot z16.s, z6.b, z2.b[3]\n" "sdot z20.s, z6.b, z3.b[3]\n" - "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n" "sdot 
z9.s, z7.b, z0.b[3]\n" "sdot z13.s, z7.b, z1.b[3]\n" "sdot z17.s, z7.b, z2.b[3]\n" "sdot z21.s, z7.b, z3.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n" "sdot z10.s, z6.b, z0.b[3]\n" "sdot z14.s, z6.b, z1.b[3]\n" "sdot z18.s, z6.b, z2.b[3]\n" @@ -901,31 +870,31 @@ void sve_hybrid_s8s32_dot_6x4VL ( "sdot z15.s, z7.b, z1.b[3]\n" "sdot z19.s, z7.b, z2.b[3]\n" "sdot z23.s, z7.b, z3.b[3]\n" - "bgt 45b\n" - "46:" // Height 4: Multiply loop: Single iteration only - "ld1b { z6.b }, p5/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "bgt 41b\n" + "42:" // Height 4: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqb { z3.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "sdot z16.s, z6.b, z2.b[0]\n" - "add x24, x24, #0x10\n" + "add x22, x22, #0x10\n" "sdot z13.s, z7.b, z1.b[0]\n" "sdot z17.s, z7.b, z2.b[0]\n" "sdot z20.s, z6.b, z3.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "sdot z21.s, z7.b, z3.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "sdot z10.s, z6.b, z0.b[0]\n" "sdot z14.s, z6.b, z1.b[0]\n" "sdot z18.s, z6.b, z2.b[0]\n" @@ -934,21 +903,21 @@ void sve_hybrid_s8s32_dot_6x4VL ( "sdot z15.s, z7.b, z1.b[0]\n" "sdot z19.s, z7.b, 
z2.b[0]\n" "sdot z23.s, z7.b, z3.b[0]\n" - "ble 47f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 43f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "sdot z8.s, z6.b, z0.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "sdot z12.s, z6.b, z1.b[1]\n" "sdot z16.s, z6.b, z2.b[1]\n" "sdot z20.s, z6.b, z3.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[1]\n" "sdot z13.s, z7.b, z1.b[1]\n" "sdot z17.s, z7.b, z2.b[1]\n" "sdot z21.s, z7.b, z3.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "sdot z10.s, z6.b, z0.b[1]\n" "sdot z14.s, z6.b, z1.b[1]\n" "sdot z18.s, z6.b, z2.b[1]\n" @@ -957,21 +926,21 @@ void sve_hybrid_s8s32_dot_6x4VL ( "sdot z15.s, z7.b, z1.b[1]\n" "sdot z19.s, z7.b, z2.b[1]\n" "sdot z23.s, z7.b, z3.b[1]\n" - "ble 47f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 43f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "sdot z8.s, z6.b, z0.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "sdot z12.s, z6.b, z1.b[2]\n" "sdot z16.s, z6.b, z2.b[2]\n" "sdot z20.s, z6.b, z3.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[2]\n" "sdot z13.s, z7.b, z1.b[2]\n" "sdot z17.s, z7.b, z2.b[2]\n" "sdot z21.s, z7.b, z3.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "sdot z10.s, z6.b, z0.b[2]\n" "sdot z14.s, z6.b, z1.b[2]\n" "sdot z18.s, z6.b, z2.b[2]\n" @@ -980,20 +949,20 @@ void sve_hybrid_s8s32_dot_6x4VL ( "sdot z15.s, z7.b, z1.b[2]\n" "sdot z19.s, z7.b, z2.b[2]\n" "sdot z23.s, z7.b, z3.b[2]\n" - "ble 47f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 43f\n" + "ld1b { z6.b }, p5/Z, 
[x9]\n" "sdot z8.s, z6.b, z0.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" "sdot z12.s, z6.b, z1.b[3]\n" "sdot z16.s, z6.b, z2.b[3]\n" "sdot z20.s, z6.b, z3.b[3]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[3]\n" "sdot z13.s, z7.b, z1.b[3]\n" "sdot z17.s, z7.b, z2.b[3]\n" "sdot z21.s, z7.b, z3.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "sdot z10.s, z6.b, z0.b[3]\n" "sdot z14.s, z6.b, z1.b[3]\n" "sdot z18.s, z6.b, z2.b[3]\n" @@ -1002,94 +971,82 @@ void sve_hybrid_s8s32_dot_6x4VL ( "sdot z15.s, z7.b, z1.b[3]\n" "sdot z19.s, z7.b, z2.b[3]\n" "sdot z23.s, z7.b, z3.b[3]\n" - "47:" // Height 4: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "43:" // Height 4: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 42b\n" - "st1w { z8.s }, p4, [x13]\n" - "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" - "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" - "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "st1w { z12.s }, p4, [x9]\n" - "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" - "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" - "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" - "addvl x9, x9, #4\n" - "st1w { z16.s }, p4, [x27]\n" - "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" - "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" - "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" - "addvl x27, x27, #4\n" - "st1w { z20.s }, p4, [x25]\n" - "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" - "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" - "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" 
- "addvl x25, x25, #4\n" - "48:" // Height 4: Writeback done - "decw x15, ALL, MUL #4\n" - "cmp x15, XZR\n" - "bgt 39b\n" - "b 74f\n" - "49:" // Height 5 - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "cmp x27, x19\n" + "bne 38b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 50f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "ldr x23, [%x[output_ptr], #0x20]\n" - "add x27, x27, x19, LSL #2\n" - "add x25, x25, x19, LSL #2\n" - "add x23, x23, x19, LSL #2\n" - "b 51f\n" - "50:" // Height 5: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "add x23, x25, x19, LSL #2\n" - "51:" // Height 5: Column loop + "st1w { z8.s }, p4, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "st1w { z9.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x28, #2, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "st1w { z11.s }, p1, [x28, #3, MUL VL]\n" + "add x21, x22, x19, LSL #2\n" + "st1w { z12.s }, p4, [x23]\n" + "addvl x28, x28, #4\n" + "st1w { z13.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x23, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x22]\n" + "st1w { z17.s }, p3, [x22, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x22, #3, MUL VL]\n" + "st1w { z20.s }, p4, [x21]\n" + "st1w { z21.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x21, #3, MUL VL]\n" + "44:" // Height 4: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 35b\n" + "b 68f\n" + "45:" // Height 5 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "46:" // 
Height 5: Column loop "mov x19, #0x0\n" - "whilelt p4.s, x19, x15\n" + "whilelt p4.s, x19, x10\n" "incw x19\n" - "whilelt p3.s, x19, x15\n" + "whilelt p3.s, x19, x10\n" "incw x19\n" - "whilelt p2.s, x19, x15\n" + "whilelt p2.s, x19, x10\n" "incw x19\n" - "whilelt p1.s, x19, x15\n" - "tbz %x[flags], #0, 52f\n" - "ld1w { z8.s }, p4/Z, [x13]\n" - "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" - "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x9]\n" - "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x27]\n" - "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x25]\n" - "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" - "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z24.s }, p4/Z, [x23]\n" - "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n" - "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n" - "b 53f\n" - "52:" // Height 5: no accumulate + "whilelt p1.s, x19, x10\n" + "tbz %x[flags], #0, 47f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z8.s }, p4/Z, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n" + "add x21, x22, x19, LSL #2\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "add x20, x21, x19, LSL #2\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x22]\n" + "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, 
[x21]\n" + "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x20]\n" + "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n" + "b 48f\n" + "47:" // Height 5: no accumulate "mov z8.s, #0x0\n" "mov z9.s, #0x0\n" "mov z10.s, #0x0\n" @@ -1110,143 +1067,143 @@ void sve_hybrid_s8s32_dot_6x4VL ( "mov z25.s, #0x0\n" "mov z26.s, #0x0\n" "mov z27.s, #0x0\n" - "53:" // Height 5: setup done - "mov x12, #0x0\n" - "54:" // Height 5: String loop + "48:" // Height 5: setup done + "mov x27, #0x0\n" + "49:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 55f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 50f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "ldr x22, [x20, #0x20]\n" - "cbnz x12, 56f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "cbnz x27, 51f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "add x26, x26, x19\n" + "add x25, x25, x19\n" "add x24, x24, x19\n" + "add x23, x23, x19\n" "add x22, x22, x19\n" - "b 56f\n" - "55:" // Height 5: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "add x26, x28, x19\n" - "add x24, x26, x19\n" - "add x22, x24, x19\n" - "56:" // Height 5: input setup done - "cmp x11, #0x10\n" - "ble 58f\n" - "57:" // Height 5: Multiply loop: Main loop head - "ld1b { z6.b }, p5/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, 
p5/Z, [x14, #1, MUL VL]\n" - "sub x11, x11, #0x10\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "add x21, x21, x19\n" + "b 51f\n" + "50:" // Height 5: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "51:" // Height 5: input setup done + "cmp x26, #0x10\n" + "ble 53f\n" + "52:" // Height 5: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqb { z3.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "sdot z16.s, z6.b, z2.b[0]\n" - "ld1rqb { z4.b }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" - "sdot z13.s, z7.b, z1.b[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" + "ld1rqb { z4.b }, p0/Z, [x21]\n" "add x22, x22, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x21, x21, #0x10\n" "sdot z20.s, z6.b, z3.b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "cmp x11, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x26, #0x10\n" "sdot z24.s, z6.b, z4.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "sdot z17.s, z7.b, z2.b[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "sdot z21.s, z7.b, z3.b[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" "sdot z25.s, z7.b, z4.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" "sdot z10.s, z6.b, z0.b[0]\n" - "prfm pldl1keep, [x22, #0x80]\n" + "prfm 
pldl1keep, [x21, #0x80]\n" "sdot z14.s, z6.b, z1.b[0]\n" "sdot z18.s, z6.b, z2.b[0]\n" "sdot z22.s, z6.b, z3.b[0]\n" "sdot z26.s, z6.b, z4.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n" "sdot z11.s, z7.b, z0.b[0]\n" "sdot z15.s, z7.b, z1.b[0]\n" "sdot z19.s, z7.b, z2.b[0]\n" "sdot z23.s, z7.b, z3.b[0]\n" "sdot z27.s, z7.b, z4.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n" "sdot z8.s, z6.b, z0.b[1]\n" "sdot z12.s, z6.b, z1.b[1]\n" "sdot z16.s, z6.b, z2.b[1]\n" "sdot z20.s, z6.b, z3.b[1]\n" "sdot z24.s, z6.b, z4.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n" "sdot z9.s, z7.b, z0.b[1]\n" "sdot z13.s, z7.b, z1.b[1]\n" "sdot z17.s, z7.b, z2.b[1]\n" "sdot z21.s, z7.b, z3.b[1]\n" "sdot z25.s, z7.b, z4.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" - "addvl x14, x14, #16\n" + "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" "sdot z10.s, z6.b, z0.b[1]\n" "sdot z14.s, z6.b, z1.b[1]\n" "sdot z18.s, z6.b, z2.b[1]\n" "sdot z22.s, z6.b, z3.b[1]\n" "sdot z26.s, z6.b, z4.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n" "sdot z11.s, z7.b, z0.b[1]\n" "sdot z15.s, z7.b, z1.b[1]\n" "sdot z19.s, z7.b, z2.b[1]\n" "sdot z23.s, z7.b, z3.b[1]\n" "sdot z27.s, z7.b, z4.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n" "sdot z8.s, z6.b, z0.b[2]\n" "sdot z12.s, z6.b, z1.b[2]\n" "sdot z16.s, z6.b, z2.b[2]\n" "sdot z20.s, z6.b, z3.b[2]\n" "sdot z24.s, z6.b, z4.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n" "sdot z9.s, z7.b, z0.b[2]\n" "sdot z13.s, z7.b, z1.b[2]\n" "sdot z17.s, z7.b, z2.b[2]\n" "sdot z21.s, z7.b, z3.b[2]\n" "sdot z25.s, z7.b, z4.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n" "sdot z10.s, z6.b, z0.b[2]\n" "sdot 
z14.s, z6.b, z1.b[2]\n" "sdot z18.s, z6.b, z2.b[2]\n" "sdot z22.s, z6.b, z3.b[2]\n" "sdot z26.s, z6.b, z4.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n" "sdot z11.s, z7.b, z0.b[2]\n" "sdot z15.s, z7.b, z1.b[2]\n" "sdot z19.s, z7.b, z2.b[2]\n" "sdot z23.s, z7.b, z3.b[2]\n" "sdot z27.s, z7.b, z4.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n" "sdot z8.s, z6.b, z0.b[3]\n" "sdot z12.s, z6.b, z1.b[3]\n" "sdot z16.s, z6.b, z2.b[3]\n" "sdot z20.s, z6.b, z3.b[3]\n" "sdot z24.s, z6.b, z4.b[3]\n" - "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[3]\n" "sdot z13.s, z7.b, z1.b[3]\n" "sdot z17.s, z7.b, z2.b[3]\n" "sdot z21.s, z7.b, z3.b[3]\n" "sdot z25.s, z7.b, z4.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n" "sdot z10.s, z6.b, z0.b[3]\n" "sdot z14.s, z6.b, z1.b[3]\n" "sdot z18.s, z6.b, z2.b[3]\n" @@ -1257,35 +1214,35 @@ void sve_hybrid_s8s32_dot_6x4VL ( "sdot z19.s, z7.b, z2.b[3]\n" "sdot z23.s, z7.b, z3.b[3]\n" "sdot z27.s, z7.b, z4.b[3]\n" - "bgt 57b\n" - "58:" // Height 5: Multiply loop: Single iteration only - "ld1b { z6.b }, p5/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "bgt 52b\n" + "53:" // Height 5: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" - "ld1rqb { z3.b }, p0/Z, 
[x24]\n" - "add x26, x26, #0x10\n" + "ld1rqb { z3.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "sdot z16.s, z6.b, z2.b[0]\n" - "ld1rqb { z4.b }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" - "sdot z13.s, z7.b, z1.b[0]\n" + "ld1rqb { z4.b }, p0/Z, [x21]\n" "add x22, x22, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "add x21, x21, #0x10\n" "sdot z17.s, z7.b, z2.b[0]\n" "sdot z20.s, z6.b, z3.b[0]\n" "sdot z24.s, z6.b, z4.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "sdot z21.s, z7.b, z3.b[0]\n" "sdot z25.s, z7.b, z4.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "sdot z10.s, z6.b, z0.b[0]\n" "sdot z14.s, z6.b, z1.b[0]\n" "sdot z18.s, z6.b, z2.b[0]\n" @@ -1296,23 +1253,23 @@ void sve_hybrid_s8s32_dot_6x4VL ( "sdot z19.s, z7.b, z2.b[0]\n" "sdot z23.s, z7.b, z3.b[0]\n" "sdot z27.s, z7.b, z4.b[0]\n" - "ble 59f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 54f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "sdot z8.s, z6.b, z0.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "sdot z12.s, z6.b, z1.b[1]\n" "sdot z16.s, z6.b, z2.b[1]\n" "sdot z20.s, z6.b, z3.b[1]\n" "sdot z24.s, z6.b, z4.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[1]\n" "sdot z13.s, z7.b, z1.b[1]\n" "sdot z17.s, z7.b, z2.b[1]\n" "sdot z21.s, z7.b, z3.b[1]\n" "sdot z25.s, z7.b, z4.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "sdot z10.s, z6.b, z0.b[1]\n" "sdot z14.s, z6.b, z1.b[1]\n" "sdot z18.s, z6.b, z2.b[1]\n" @@ -1323,23 +1280,23 @@ void sve_hybrid_s8s32_dot_6x4VL ( "sdot z19.s, z7.b, z2.b[1]\n" "sdot z23.s, z7.b, z3.b[1]\n" "sdot z27.s, z7.b, z4.b[1]\n" - "ble 59f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 
54f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "sdot z8.s, z6.b, z0.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "sdot z12.s, z6.b, z1.b[2]\n" "sdot z16.s, z6.b, z2.b[2]\n" "sdot z20.s, z6.b, z3.b[2]\n" "sdot z24.s, z6.b, z4.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[2]\n" "sdot z13.s, z7.b, z1.b[2]\n" "sdot z17.s, z7.b, z2.b[2]\n" "sdot z21.s, z7.b, z3.b[2]\n" "sdot z25.s, z7.b, z4.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "sdot z10.s, z6.b, z0.b[2]\n" "sdot z14.s, z6.b, z1.b[2]\n" "sdot z18.s, z6.b, z2.b[2]\n" @@ -1350,22 +1307,22 @@ void sve_hybrid_s8s32_dot_6x4VL ( "sdot z19.s, z7.b, z2.b[2]\n" "sdot z23.s, z7.b, z3.b[2]\n" "sdot z27.s, z7.b, z4.b[2]\n" - "ble 59f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 54f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "sdot z8.s, z6.b, z0.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" "sdot z12.s, z6.b, z1.b[3]\n" "sdot z16.s, z6.b, z2.b[3]\n" "sdot z20.s, z6.b, z3.b[3]\n" "sdot z24.s, z6.b, z4.b[3]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[3]\n" "sdot z13.s, z7.b, z1.b[3]\n" "sdot z17.s, z7.b, z2.b[3]\n" "sdot z21.s, z7.b, z3.b[3]\n" "sdot z25.s, z7.b, z4.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "sdot z10.s, z6.b, z0.b[3]\n" "sdot z14.s, z6.b, z1.b[3]\n" "sdot z18.s, z6.b, z2.b[3]\n" @@ -1376,109 +1333,96 @@ void sve_hybrid_s8s32_dot_6x4VL ( "sdot z19.s, z7.b, z2.b[3]\n" "sdot z23.s, z7.b, z3.b[3]\n" "sdot z27.s, z7.b, z4.b[3]\n" - "59:" // Height 5: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - 
"prfm pldl1keep, [x28, #0x80]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "54:" // Height 5: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 54b\n" - "st1w { z8.s }, p4, [x13]\n" - "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" - "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" - "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "st1w { z12.s }, p4, [x9]\n" - "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" - "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" - "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" - "addvl x9, x9, #4\n" - "st1w { z16.s }, p4, [x27]\n" - "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" - "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" - "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" - "addvl x27, x27, #4\n" - "st1w { z20.s }, p4, [x25]\n" - "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" - "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" - "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" - "addvl x25, x25, #4\n" - "st1w { z24.s }, p4, [x23]\n" - "st1w { z25.s }, p3, [x23, #1, MUL VL]\n" - "st1w { z26.s }, p2, [x23, #2, MUL VL]\n" - "st1w { z27.s }, p1, [x23, #3, MUL VL]\n" - "addvl x23, x23, #4\n" - "60:" // Height 5: Writeback done - "decw x15, ALL, MUL #4\n" - "cmp x15, XZR\n" - "bgt 51b\n" - "b 74f\n" - "61:" // Height 6 - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "cmp x27, x19\n" + "bne 49b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "st1w { z8.s }, p4, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "st1w { z9.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x28, #2, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "st1w { z11.s }, p1, [x28, #3, MUL VL]\n" + "add x21, x22, x19, LSL #2\n" + "st1w { z12.s }, p4, [x23]\n" + "add x20, x21, x19, LSL #2\n" + "st1w { z13.s }, p3, 
[x23, #1, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z14.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x23, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x22]\n" + "st1w { z17.s }, p3, [x22, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x22, #3, MUL VL]\n" + "st1w { z20.s }, p4, [x21]\n" + "st1w { z21.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x21, #3, MUL VL]\n" + "st1w { z24.s }, p4, [x20]\n" + "st1w { z25.s }, p3, [x20, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x20, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x20, #3, MUL VL]\n" + "55:" // Height 5: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 46b\n" + "b 68f\n" + "56:" // Height 6 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x20, #0x18\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 62f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "ldr x23, [%x[output_ptr], #0x20]\n" - "add x27, x27, x19, LSL #2\n" - "ldr x21, [%x[output_ptr], #0x28]\n" - "add %x[output_ptr], %x[output_ptr], #0x30\n" - "add x25, x25, x19, LSL #2\n" - "add x23, x23, x19, LSL #2\n" - "add x21, x21, x19, LSL #2\n" - "b 63f\n" - "62:" // Height 6: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "add x23, x25, x19, LSL #2\n" - "add x21, x23, x19, LSL #2\n" - "add %x[output_ptr], x21, x19, LSL #2\n" - "63:" // Height 6: Column loop + "madd %x[output_ptr], x19, x20, %x[output_ptr]\n" + "57:" // Height 6: Column loop "mov x19, #0x0\n" - "whilelt p4.s, x19, x15\n" + "whilelt p4.s, x19, x10\n" "incw x19\n" - "whilelt p3.s, x19, x15\n" + "whilelt p3.s, x19, 
x10\n" "incw x19\n" - "whilelt p2.s, x19, x15\n" + "whilelt p2.s, x19, x10\n" "incw x19\n" - "whilelt p1.s, x19, x15\n" - "tbz %x[flags], #0, 64f\n" - "ld1w { z8.s }, p4/Z, [x13]\n" - "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" - "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x9]\n" - "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x27]\n" - "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x25]\n" - "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" - "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z24.s }, p4/Z, [x23]\n" - "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n" - "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n" - "ld1w { z28.s }, p4/Z, [x21]\n" - "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n" - "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n" - "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n" - "b 65f\n" - "64:" // Height 6: no accumulate + "whilelt p1.s, x19, x10\n" + "tbz %x[flags], #0, 58f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z8.s }, p4/Z, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n" + "add x21, x22, x19, LSL #2\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "add x20, x21, x19, LSL #2\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "add x19, x20, x19, LSL #2\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x22]\n" + "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x22, #3, 
MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x21]\n" + "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x20]\n" + "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n" + "ld1w { z28.s }, p4/Z, [x19]\n" + "ld1w { z29.s }, p3/Z, [x19, #1, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x19, #2, MUL VL]\n" + "ld1w { z31.s }, p1/Z, [x19, #3, MUL VL]\n" + "b 59f\n" + "58:" // Height 6: no accumulate "mov z8.s, #0x0\n" "mov z9.s, #0x0\n" "mov z10.s, #0x0\n" @@ -1503,77 +1447,77 @@ void sve_hybrid_s8s32_dot_6x4VL ( "mov z29.s, #0x0\n" "mov z30.s, #0x0\n" "mov z31.s, #0x0\n" - "65:" // Height 6: setup done - "mov x12, #0x0\n" - "66:" // Height 6: String loop + "59:" // Height 6: setup done + "mov x27, #0x0\n" + "60:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 67f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 61f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "ldr x22, [x20, #0x20]\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" "ldr x20, [x20, #0x28]\n" - "cbnz x12, 68f\n" + "cbnz x27, 62f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "add x26, x26, x19\n" + "add x25, x25, x19\n" "add x24, x24, x19\n" + "add x23, x23, x19\n" "add x22, x22, x19\n" + "add x21, x21, x19\n" "add x20, x20, x19\n" - "b 68f\n" - "67:" // Height 6: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - 
"add x26, x28, x19\n" - "add x24, x26, x19\n" - "add x22, x24, x19\n" - "add x20, x22, x19\n" - "68:" // Height 6: input setup done - "cmp x11, #0x10\n" - "ble 70f\n" - "69:" // Height 6: Multiply loop: Main loop head - "ld1b { z6.b }, p5/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "sub x11, x11, #0x10\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "b 62f\n" + "61:" // Height 6: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "add x20, x21, x19\n" + "62:" // Height 6: input setup done + "cmp x26, #0x10\n" + "ble 64f\n" + "63:" // Height 6: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqb { z3.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "sdot z16.s, z6.b, z2.b[0]\n" - "ld1rqb { z4.b }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" + "ld1rqb { z4.b }, p0/Z, [x21]\n" + "add x22, x22, #0x10\n" "sdot z13.s, z7.b, z1.b[0]\n" "ld1rqb { z5.b }, p0/Z, [x20]\n" - "add x22, x22, #0x10\n" + "add x21, x21, #0x10\n" "sdot z20.s, z6.b, z3.b[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" "add x20, x20, #0x10\n" "sdot z24.s, z6.b, z4.b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "cmp x11, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x26, #0x10\n" "sdot z28.s, z6.b, z5.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "sdot z17.s, 
z7.b, z2.b[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "sdot z21.s, z7.b, z3.b[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "sdot z25.s, z7.b, z4.b[0]\n" "prfm pldl1keep, [x22, #0x80]\n" + "sdot z25.s, z7.b, z4.b[0]\n" + "prfm pldl1keep, [x21, #0x80]\n" "sdot z29.s, z7.b, z5.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" "sdot z10.s, z6.b, z0.b[0]\n" "prfm pldl1keep, [x20, #0x80]\n" "sdot z14.s, z6.b, z1.b[0]\n" @@ -1581,85 +1525,85 @@ void sve_hybrid_s8s32_dot_6x4VL ( "sdot z22.s, z6.b, z3.b[0]\n" "sdot z26.s, z6.b, z4.b[0]\n" "sdot z30.s, z6.b, z5.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n" "sdot z11.s, z7.b, z0.b[0]\n" "sdot z15.s, z7.b, z1.b[0]\n" "sdot z19.s, z7.b, z2.b[0]\n" "sdot z23.s, z7.b, z3.b[0]\n" "sdot z27.s, z7.b, z4.b[0]\n" "sdot z31.s, z7.b, z5.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n" "sdot z8.s, z6.b, z0.b[1]\n" "sdot z12.s, z6.b, z1.b[1]\n" "sdot z16.s, z6.b, z2.b[1]\n" "sdot z20.s, z6.b, z3.b[1]\n" "sdot z24.s, z6.b, z4.b[1]\n" "sdot z28.s, z6.b, z5.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n" "sdot z9.s, z7.b, z0.b[1]\n" "sdot z13.s, z7.b, z1.b[1]\n" "sdot z17.s, z7.b, z2.b[1]\n" "sdot z21.s, z7.b, z3.b[1]\n" "sdot z25.s, z7.b, z4.b[1]\n" "sdot z29.s, z7.b, z5.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" - "addvl x14, x14, #16\n" + "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" "sdot z10.s, z6.b, z0.b[1]\n" "sdot z14.s, z6.b, z1.b[1]\n" "sdot z18.s, z6.b, z2.b[1]\n" "sdot z22.s, z6.b, z3.b[1]\n" "sdot z26.s, z6.b, z4.b[1]\n" "sdot z30.s, z6.b, z5.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n" "sdot z11.s, z7.b, z0.b[1]\n" "sdot z15.s, z7.b, z1.b[1]\n" "sdot z19.s, z7.b, z2.b[1]\n" "sdot z23.s, z7.b, z3.b[1]\n" "sdot z27.s, z7.b, 
z4.b[1]\n" "sdot z31.s, z7.b, z5.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n" "sdot z8.s, z6.b, z0.b[2]\n" "sdot z12.s, z6.b, z1.b[2]\n" "sdot z16.s, z6.b, z2.b[2]\n" "sdot z20.s, z6.b, z3.b[2]\n" "sdot z24.s, z6.b, z4.b[2]\n" "sdot z28.s, z6.b, z5.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n" "sdot z9.s, z7.b, z0.b[2]\n" "sdot z13.s, z7.b, z1.b[2]\n" "sdot z17.s, z7.b, z2.b[2]\n" "sdot z21.s, z7.b, z3.b[2]\n" "sdot z25.s, z7.b, z4.b[2]\n" "sdot z29.s, z7.b, z5.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n" "sdot z10.s, z6.b, z0.b[2]\n" "sdot z14.s, z6.b, z1.b[2]\n" "sdot z18.s, z6.b, z2.b[2]\n" "sdot z22.s, z6.b, z3.b[2]\n" "sdot z26.s, z6.b, z4.b[2]\n" "sdot z30.s, z6.b, z5.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n" "sdot z11.s, z7.b, z0.b[2]\n" "sdot z15.s, z7.b, z1.b[2]\n" "sdot z19.s, z7.b, z2.b[2]\n" "sdot z23.s, z7.b, z3.b[2]\n" "sdot z27.s, z7.b, z4.b[2]\n" "sdot z31.s, z7.b, z5.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n" "sdot z8.s, z6.b, z0.b[3]\n" "sdot z12.s, z6.b, z1.b[3]\n" "sdot z16.s, z6.b, z2.b[3]\n" "sdot z20.s, z6.b, z3.b[3]\n" "sdot z24.s, z6.b, z4.b[3]\n" "sdot z28.s, z6.b, z5.b[3]\n" - "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[3]\n" "sdot z13.s, z7.b, z1.b[3]\n" "sdot z17.s, z7.b, z2.b[3]\n" "sdot z21.s, z7.b, z3.b[3]\n" "sdot z25.s, z7.b, z4.b[3]\n" "sdot z29.s, z7.b, z5.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n" "sdot z10.s, z6.b, z0.b[3]\n" "sdot z14.s, z6.b, z1.b[3]\n" "sdot z18.s, z6.b, z2.b[3]\n" @@ -1672,39 +1616,39 @@ void sve_hybrid_s8s32_dot_6x4VL ( "sdot z23.s, z7.b, z3.b[3]\n" "sdot z27.s, z7.b, z4.b[3]\n" "sdot z31.s, z7.b, z5.b[3]\n" - "bgt 
69b\n" - "70:" // Height 6: Multiply loop: Single iteration only - "ld1b { z6.b }, p5/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "bgt 63b\n" + "64:" // Height 6: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "sdot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "sdot z9.s, z7.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "sdot z12.s, z6.b, z1.b[0]\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqb { z3.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "sdot z16.s, z6.b, z2.b[0]\n" - "ld1rqb { z4.b }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" + "ld1rqb { z4.b }, p0/Z, [x21]\n" + "add x22, x22, #0x10\n" "sdot z13.s, z7.b, z1.b[0]\n" "ld1rqb { z5.b }, p0/Z, [x20]\n" - "add x22, x22, #0x10\n" + "add x21, x21, #0x10\n" "sdot z20.s, z6.b, z3.b[0]\n" "add x20, x20, #0x10\n" "sdot z17.s, z7.b, z2.b[0]\n" "sdot z24.s, z6.b, z4.b[0]\n" "sdot z28.s, z6.b, z5.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "sdot z21.s, z7.b, z3.b[0]\n" "sdot z25.s, z7.b, z4.b[0]\n" "sdot z29.s, z7.b, z5.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "sdot z10.s, z6.b, z0.b[0]\n" "sdot z14.s, z6.b, z1.b[0]\n" "sdot z18.s, z6.b, z2.b[0]\n" @@ -1717,25 +1661,25 @@ void sve_hybrid_s8s32_dot_6x4VL ( "sdot z23.s, z7.b, z3.b[0]\n" "sdot z27.s, z7.b, z4.b[0]\n" "sdot z31.s, z7.b, z5.b[0]\n" - "ble 71f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 65f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "sdot z8.s, 
z6.b, z0.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "sdot z12.s, z6.b, z1.b[1]\n" "sdot z16.s, z6.b, z2.b[1]\n" "sdot z20.s, z6.b, z3.b[1]\n" "sdot z24.s, z6.b, z4.b[1]\n" "sdot z28.s, z6.b, z5.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[1]\n" "sdot z13.s, z7.b, z1.b[1]\n" "sdot z17.s, z7.b, z2.b[1]\n" "sdot z21.s, z7.b, z3.b[1]\n" "sdot z25.s, z7.b, z4.b[1]\n" "sdot z29.s, z7.b, z5.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "sdot z10.s, z6.b, z0.b[1]\n" "sdot z14.s, z6.b, z1.b[1]\n" "sdot z18.s, z6.b, z2.b[1]\n" @@ -1748,25 +1692,25 @@ void sve_hybrid_s8s32_dot_6x4VL ( "sdot z23.s, z7.b, z3.b[1]\n" "sdot z27.s, z7.b, z4.b[1]\n" "sdot z31.s, z7.b, z5.b[1]\n" - "ble 71f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 65f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "sdot z8.s, z6.b, z0.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "sdot z12.s, z6.b, z1.b[2]\n" "sdot z16.s, z6.b, z2.b[2]\n" "sdot z20.s, z6.b, z3.b[2]\n" "sdot z24.s, z6.b, z4.b[2]\n" "sdot z28.s, z6.b, z5.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[2]\n" "sdot z13.s, z7.b, z1.b[2]\n" "sdot z17.s, z7.b, z2.b[2]\n" "sdot z21.s, z7.b, z3.b[2]\n" "sdot z25.s, z7.b, z4.b[2]\n" "sdot z29.s, z7.b, z5.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "sdot z10.s, z6.b, z0.b[2]\n" "sdot z14.s, z6.b, z1.b[2]\n" "sdot z18.s, z6.b, z2.b[2]\n" @@ -1779,24 +1723,24 @@ void sve_hybrid_s8s32_dot_6x4VL ( "sdot z23.s, z7.b, z3.b[2]\n" "sdot z27.s, z7.b, z4.b[2]\n" "sdot z31.s, z7.b, 
z5.b[2]\n" - "ble 71f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 65f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "sdot z8.s, z6.b, z0.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" "sdot z12.s, z6.b, z1.b[3]\n" "sdot z16.s, z6.b, z2.b[3]\n" "sdot z20.s, z6.b, z3.b[3]\n" "sdot z24.s, z6.b, z4.b[3]\n" "sdot z28.s, z6.b, z5.b[3]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b[3]\n" "sdot z13.s, z7.b, z1.b[3]\n" "sdot z17.s, z7.b, z2.b[3]\n" "sdot z21.s, z7.b, z3.b[3]\n" "sdot z25.s, z7.b, z4.b[3]\n" "sdot z29.s, z7.b, z5.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "sdot z10.s, z6.b, z0.b[3]\n" "sdot z14.s, z6.b, z1.b[3]\n" "sdot z18.s, z6.b, z2.b[3]\n" @@ -1809,67 +1753,68 @@ void sve_hybrid_s8s32_dot_6x4VL ( "sdot z23.s, z7.b, z3.b[3]\n" "sdot z27.s, z7.b, z4.b[3]\n" "sdot z31.s, z7.b, z5.b[3]\n" - "71:" // Height 6: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "65:" // Height 6: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" "prfm pldl1keep, [x20, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 66b\n" - "st1w { z8.s }, p4, [x13]\n" - "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" - "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" - "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "st1w { z12.s }, p4, [x9]\n" - "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" - "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" - "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" - "addvl x9, x9, #4\n" - "st1w { z16.s }, p4, [x27]\n" - "st1w { z17.s }, p3, [x27, #1, MUL 
VL]\n" - "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" - "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" - "addvl x27, x27, #4\n" - "st1w { z20.s }, p4, [x25]\n" - "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" - "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" - "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" - "addvl x25, x25, #4\n" - "st1w { z24.s }, p4, [x23]\n" - "st1w { z25.s }, p3, [x23, #1, MUL VL]\n" - "st1w { z26.s }, p2, [x23, #2, MUL VL]\n" - "st1w { z27.s }, p1, [x23, #3, MUL VL]\n" - "addvl x23, x23, #4\n" - "st1w { z28.s }, p4, [x21]\n" - "st1w { z29.s }, p3, [x21, #1, MUL VL]\n" - "st1w { z30.s }, p2, [x21, #2, MUL VL]\n" - "st1w { z31.s }, p1, [x21, #3, MUL VL]\n" - "addvl x21, x21, #4\n" - "72:" // Height 6: Writeback done - "decw x15, ALL, MUL #4\n" - "cmp x15, XZR\n" - "bgt 63b\n" + "cmp x27, x19\n" + "bne 60b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "st1w { z8.s }, p4, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "st1w { z9.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x28, #2, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "st1w { z11.s }, p1, [x28, #3, MUL VL]\n" + "add x21, x22, x19, LSL #2\n" + "st1w { z12.s }, p4, [x23]\n" + "add x20, x21, x19, LSL #2\n" + "st1w { z13.s }, p3, [x23, #1, MUL VL]\n" + "add x19, x20, x19, LSL #2\n" + "st1w { z14.s }, p2, [x23, #2, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z15.s }, p1, [x23, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x22]\n" + "st1w { z17.s }, p3, [x22, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x22, #3, MUL VL]\n" + "st1w { z20.s }, p4, [x21]\n" + "st1w { z21.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x21, #3, MUL VL]\n" + "st1w { z24.s }, p4, [x20]\n" + "st1w { z25.s }, p3, [x20, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x20, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x20, #3, MUL VL]\n" + "st1w { z28.s }, p4, [x19]\n" + "st1w { z29.s }, p3, [x19, #1, MUL VL]\n" + "st1w { z30.s }, p2, [x19, #2, MUL VL]\n" + 
"st1w { z31.s }, p1, [x19, #3, MUL VL]\n" + "66:" // Height 6: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 57b\n" "subs %x[M], %x[M], #0x6\n" - "beq 74f\n" + "beq 68f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 73f\n" + "tbz %x[flags], #3, 67f\n" "add x20, x20, #0x6\n" "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "73:" // Update direct input + "67:" // Update direct input "mov x19, #0x6\n" "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" "b 1b\n" - "74:" // Exit + "68:" // Exit : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) - : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp 
b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp index bacf2351ac..cfb8adfc87 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp @@ -36,7 +36,6 @@ namespace arm_gemm { - // Actual kernel implementations void sve_hybrid_u8qa_dot_4x4VL( ARGLIST ); @@ -73,7 +72,6 @@ public: // Default to the generic kernel kern_type kernel=sve_hybrid_u8qa_dot_4x4VL; - cls_sve_hybrid_u8qa_dot_4x4VL(const CPUInfo *) { } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp index 52210dca27..373d82930b 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -81,152 +81,143 @@ void sve_hybrid_u8qa_dot_4x4VL ( "ptrue p2.b\n" "1:" // Row loop "cmp %x[M], #0x4\n" - "bge 46f\n" + "bge 43f\n" "cmp %x[M], #0x2\n" - "bgt 31f\n" - "beq 16f\n" + "bgt 29f\n" + "beq 15f\n" "mov z11.s, #0x0\n" - "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" - "mov z12.s, #0x0\n" - "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "mov x10, %x[col_bias]\n" - "mov z13.s, #0x0\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "bic %x[flags], %x[flags], #0x80000000\n" - "mov z14.s, #0x0\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" "mov z15.b, #0x1\n" - "tbz %x[flags], #2, 2f\n" - "ldr x9, [%x[output_ptr], #0x0]\n" - "add x9, x9, x19\n" - "b 3f\n" - "2:" // Height 1: setup direct output - "mov x9, %x[output_ptr]\n" - "3:" // Height 1: Column loop + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x27, %x[col_bias]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov x26, %x[output_ptr]\n" + "2:" // Height 1: Column loop "mov z16.s, #0x0\n" "mov x19, #0x0\n" "mov z17.s, #0x0\n" - "whilelt p1.b, x19, x12\n" + "whilelt p1.b, x19, x9\n" "mov z18.s, #0x0\n" "mov z19.s, #0x0\n" - "4:" // Height 1: setup done - "mov x28, #0x0\n" - "5:" // Height 1: String loop + "3:" // Height 1: setup done + "mov x25, #0x0\n" + "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 6f\n" - "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 5f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x26, [x20, #0x0]\n" - "cbnz x28, 7f\n" + "ldr x23, [x20, #0x0]\n" + "cbnz x25, 6f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x19\n" - "b 7f\n" - "6:" // Height 1: setup direct input - "mov x26, %x[input_ptr]\n" - "7:" // Height 1: input setup done - "cmp x27, #0x10\n" - 
"ble 10f\n" - "8:" // Height 1: Multiply loop: Main loop head - "ld1b { z4.b }, p2/Z, [x11]\n" - "whilelt p0.b, XZR, x27\n" - "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" + "add x23, x23, x19\n" + "b 6f\n" + "5:" // Height 1: setup direct input + "mov x23, %x[input_ptr]\n" + "6:" // Height 1: input setup done + "cmp x24, #0x10\n" + "ble 9f\n" + "7:" // Height 1: Multiply loop: Main loop head + "ld1b { z4.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1rqb { z0.b }, p0/Z, [x23]\n" "udot z16.s, z4.b, z0.b[0]\n" - "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" - "add x26, x26, #0x10\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" + "add x23, x23, #0x10\n" "udot z17.s, z5.b, z0.b[0]\n" - "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n" "udot z18.s, z6.b, z0.b[0]\n" - "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n" "udot z19.s, z7.b, z0.b[0]\n" "udot z16.s, z8.b, z0.b[1]\n" - "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n" - "addvl x11, x11, #16\n" + "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #16\n" "udot z17.s, z9.b, z0.b[1]\n" - "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n" "udot z18.s, z10.b, z0.b[1]\n" - "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n" "udot z19.s, z4.b, z0.b[1]\n" - "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" "udot z16.s, z5.b, z0.b[2]\n" - "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n" "udot z17.s, z6.b, z0.b[2]\n" - "ld1b { z10.b }, 
p2/Z, [x11, #-3, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n" "udot z18.s, z7.b, z0.b[2]\n" - "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n" "udot z19.s, z8.b, z0.b[2]\n" - "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n" "udot z16.s, z9.b, z0.b[3]\n" "udot z17.s, z10.b, z0.b[3]\n" "udot z18.s, z4.b, z0.b[3]\n" "udot z19.s, z5.b, z0.b[3]\n" - "tbnz %x[flags], #31, 9f\n" + "tbnz %x[flags], #31, 8f\n" "udot z11.s, z0.b, z15.b\n" - "9:" // Height 1: Multiply loop: unique 1: skip row sum - "prfm pldl1keep, [x26, #0x80]\n" - "sub x27, x27, #0x10\n" - "cmp x27, #0x10\n" - "bgt 8b\n" - "10:" // Height 1: Multiply loop: Single iteration only - "ld1b { z6.b }, p2/Z, [x11]\n" - "whilelt p0.b, XZR, x27\n" - "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n" - "subs x27, x27, #0x4\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" - "udot z16.s, z6.b, z0.b[0]\n" - "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n" - "add x26, x26, #0x10\n" - "udot z17.s, z7.b, z0.b[0]\n" - "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n" - "addvl x11, x11, #4\n" - "udot z18.s, z8.b, z0.b[0]\n" - "udot z19.s, z9.b, z0.b[0]\n" - "ble 11f\n" - "ld1b { z10.b }, p2/Z, [x11]\n" - "udot z16.s, z10.b, z0.b[1]\n" - "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n" - "subs x27, x27, #0x4\n" - "udot z17.s, z4.b, z0.b[1]\n" - "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n" - "udot z18.s, z5.b, z0.b[1]\n" - "addvl x11, x11, #4\n" - "udot z19.s, z6.b, z0.b[1]\n" - "ble 11f\n" - "ld1b { z7.b }, p2/Z, [x11]\n" - "udot z16.s, z7.b, z0.b[2]\n" - "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n" - "subs x27, x27, #0x4\n" - "udot z17.s, z8.b, z0.b[2]\n" - "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n" - "udot z18.s, z9.b, z0.b[2]\n" - "addvl x11, x11, #4\n" - "udot z19.s, z10.b, z0.b[2]\n" - "ble 11f\n" - "ld1b { z4.b }, p2/Z, [x11]\n" - "udot z16.s, z4.b, z0.b[3]\n" - "ld1b { z5.b }, 
p2/Z, [x11, #1, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" - "udot z17.s, z5.b, z0.b[3]\n" - "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" - "addvl x11, x11, #4\n" - "udot z18.s, z6.b, z0.b[3]\n" - "udot z19.s, z7.b, z0.b[3]\n" - "11:" // Height 1: Multiply loop: multiply skip - "tbnz %x[flags], #31, 12f\n" + "8:" // Height 1: Multiply loop: unique 1: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "sub x24, x24, #0x10\n" + "cmp x24, #0x10\n" + "bgt 7b\n" + "9:" // Height 1: Multiply loop: Single iteration only + "ld1b { z4.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x24, x24, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x23]\n" + "udot z16.s, z4.b, z0.b[0]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" + "add x23, x23, #0x10\n" + "udot z17.s, z5.b, z0.b[0]\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "udot z18.s, z6.b, z0.b[0]\n" + "udot z19.s, z7.b, z0.b[0]\n" + "ble 10f\n" + "ld1b { z8.b }, p2/Z, [x28]\n" + "udot z16.s, z8.b, z0.b[1]\n" + "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x24, x24, #0x4\n" + "udot z17.s, z9.b, z0.b[1]\n" + "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" + "udot z18.s, z10.b, z0.b[1]\n" + "addvl x28, x28, #4\n" + "udot z19.s, z4.b, z0.b[1]\n" + "ble 10f\n" + "ld1b { z5.b }, p2/Z, [x28]\n" + "udot z16.s, z5.b, z0.b[2]\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x24, x24, #0x4\n" + "udot z17.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + "udot z18.s, z7.b, z0.b[2]\n" + "addvl x28, x28, #4\n" + "udot z19.s, z8.b, z0.b[2]\n" + "ble 10f\n" + "ld1b { z9.b }, p2/Z, [x28]\n" + "udot z16.s, z9.b, z0.b[3]\n" + "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" + "udot z17.s, z10.b, z0.b[3]\n" + "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "udot z18.s, z4.b, z0.b[3]\n" + 
"udot z19.s, z5.b, z0.b[3]\n" + "10:" // Height 1: Multiply loop: multiply skip + "tbnz %x[flags], #31, 11f\n" "udot z11.s, z0.b, z15.b\n" - "12:" // Height 1: Multiply loop: unique 2: skip row sum - "prfm pldl1keep, [x26, #0x80]\n" - "add x28, x28, #0x1\n" + "11:" // Height 1: Multiply loop: unique 2: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "add x25, x25, #0x1\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x28, x19\n" - "bne 5b\n" - "tbnz %x[flags], #31, 13f\n" + "cmp x25, x19\n" + "bne 4b\n" + "tbnz %x[flags], #31, 12f\n" "add x19, %x[qp], %[b_offset]\n" "ld1rw { z1.s }, p2/Z, [x19]\n" "neg z1.s, p2/M, z1.s\n" @@ -235,21 +226,21 @@ void sve_hybrid_u8qa_dot_4x4VL ( "uaddv d11, p0, z11.s\n" "mov z11.s, z11.s[0]\n" "mul z11.s, p2/M, z11.s, z1.s\n" - "13:" // Height 1: skip row sum fixup + "12:" // Height 1: skip row sum fixup "add z16.s, z16.s, z11.s\n" - "ld1w { z0.s }, p2/Z, [x10]\n" + "ld1w { z0.s }, p2/Z, [x27]\n" "orr %x[flags], %x[flags], #0x80000000\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add z18.s, z18.s, z11.s\n" - "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n" "add x19, %x[qp], %[per_layer_mul]\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" - "addvl x10, x10, #4\n" + "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" "add z16.s, z16.s, z0.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" "add z17.s, z17.s, z1.s\n" "ld1rw { z4.s }, p2/Z, [x19]\n" "add z18.s, z18.s, z2.s\n" @@ -258,7 +249,7 @@ void sve_hybrid_u8qa_dot_4x4VL ( ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" - "tbz %x[flags], #5, 14f\n" + "tbz %x[flags], #5, 13f\n" "and z4.d, 
z16.d, z0.d\n" "asr z4.s, z4.s, #0x1f\n" "and z5.d, z17.d, z0.d\n" @@ -271,7 +262,7 @@ void sve_hybrid_u8qa_dot_4x4VL ( "sqadd z17.s, z17.s, z5.s\n" "sqadd z18.s, z18.s, z6.s\n" "sqadd z19.s, z19.s, z7.s\n" - "14:" // Height 1: no shift correction + "13:" // Height 1: no shift correction ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" "add x19, %x[qp], %[c_offset]\n" "ld1rw { z4.s }, p2/Z, [x19]\n" @@ -297,107 +288,96 @@ void sve_hybrid_u8qa_dot_4x4VL ( "uzp1 z16.h, z16.h, z17.h\n" "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "st1b { z16.b }, p1, [x9]\n" - "addvl x9, x9, #1\n" - "15:" // Height 1: Writeback done - "decw x12, ALL, MUL #4\n" - "cmp x12, XZR\n" - "bgt 3b\n" - "b 62f\n" - "16:" // Height 2 + "st1b { z16.b }, p1, [x26]\n" + "addvl x26, x26, #1\n" + "14:" // Height 1: Writeback done + "decw x9, ALL, MUL #4\n" + "cmp x9, XZR\n" + "bgt 2b\n" + "b 58f\n" + "15:" // Height 2 "mov z11.s, #0x0\n" - "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" - "mov x10, %x[col_bias]\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[col_bias]\n" "mov z12.s, #0x0\n" - "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" "bic %x[flags], %x[flags], #0x80000000\n" - "mov z13.s, #0x0\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "mov z14.s, #0x0\n" "mov z15.b, #0x1\n" - "tbz %x[flags], #2, 17f\n" - "ldr x9, [%x[output_ptr], #0x0]\n" - "ldr x25, [%x[output_ptr], #0x8]\n" - "add x9, x9, x19\n" - "add x25, x25, x19\n" - "b 18f\n" - "17:" // Height 2: setup direct output - "mov x9, %x[output_ptr]\n" - "add x25, x9, x19\n" - "18:" // Height 2: Column loop + "mov x26, %x[output_ptr]\n" + "16:" // Height 2: Column loop "mov z16.s, #0x0\n" "mov x19, #0x0\n" "mov z17.s, #0x0\n" - "whilelt p1.b, x19, x12\n" + "whilelt p1.b, x19, x9\n" "mov z18.s, #0x0\n" "mov z19.s, #0x0\n" "mov z20.s, #0x0\n" "mov z21.s, #0x0\n" "mov z22.s, #0x0\n" "mov z23.s, #0x0\n" - "19:" // Height 2: setup done - "mov x28, 
#0x0\n" - "20:" // Height 2: String loop + "17:" // Height 2: setup done + "mov x25, #0x0\n" + "18:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 21f\n" - "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 19f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x26, [x20, #0x0]\n" - "ldr x24, [x20, #0x8]\n" - "cbnz x28, 22f\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "cbnz x25, 20f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x19\n" - "add x24, x24, x19\n" - "b 22f\n" - "21:" // Height 2: setup direct input - "mov x26, %x[input_ptr]\n" - "add x24, x26, x19\n" - "22:" // Height 2: input setup done - "cmp x27, #0x10\n" - "ble 25f\n" - "23:" // Height 2: Multiply loop: Main loop head - "ld1b { z4.b }, p2/Z, [x11]\n" - "whilelt p0.b, XZR, x27\n" - "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "b 20f\n" + "19:" // Height 2: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19\n" + "20:" // Height 2: input setup done + "cmp x24, #0x10\n" + "ble 23f\n" + "21:" // Height 2: Multiply loop: Main loop head + "ld1b { z4.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1rqb { z0.b }, p0/Z, [x23]\n" "udot z16.s, z4.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "udot z17.s, z5.b, z0.b[0]\n" - "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" - "add x24, x24, #0x10\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" + "add x22, x22, #0x10\n" "udot z20.s, z4.b, z1.b[0]\n" - "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" "udot z21.s, z5.b, 
z1.b[0]\n" - "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n" "udot z18.s, z6.b, z0.b[0]\n" - "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n" "udot z22.s, z6.b, z1.b[0]\n" - "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n" "udot z19.s, z7.b, z0.b[0]\n" - "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n" - "addvl x11, x11, #16\n" + "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #16\n" "udot z23.s, z7.b, z1.b[0]\n" - "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n" "udot z16.s, z8.b, z0.b[1]\n" - "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n" "udot z20.s, z8.b, z1.b[1]\n" - "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n" "udot z17.s, z9.b, z0.b[1]\n" - "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" "udot z21.s, z9.b, z1.b[1]\n" - "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n" "udot z18.s, z10.b, z0.b[1]\n" "udot z22.s, z10.b, z1.b[1]\n" - "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n" "udot z19.s, z4.b, z0.b[1]\n" "udot z23.s, z4.b, z1.b[1]\n" - "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n" "udot z16.s, z5.b, z0.b[2]\n" "udot z20.s, z5.b, z1.b[2]\n" - "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n" "udot z17.s, z6.b, z0.b[2]\n" "udot z21.s, z6.b, z1.b[2]\n" "udot z18.s, z7.b, z0.b[2]\n" @@ -412,91 +392,93 @@ void sve_hybrid_u8qa_dot_4x4VL ( "udot z22.s, z4.b, z1.b[3]\n" "udot z19.s, z5.b, z0.b[3]\n" "udot z23.s, z5.b, z1.b[3]\n" - "tbnz %x[flags], #31, 24f\n" + "tbnz %x[flags], #31, 22f\n" "udot z11.s, z0.b, z15.b\n" "udot z12.s, z1.b, z15.b\n" - "24:" // Height 2: Multiply loop: unique 3: skip row sum - 
"prfm pldl1keep, [x26, #0x80]\n" - "sub x27, x27, #0x10\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x27, #0x10\n" - "bgt 23b\n" - "25:" // Height 2: Multiply loop: Single iteration only - "ld1b { z6.b }, p2/Z, [x11]\n" - "whilelt p0.b, XZR, x27\n" - "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n" - "subs x27, x27, #0x4\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" - "udot z16.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" - "udot z17.s, z7.b, z0.b[0]\n" - "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n" - "add x24, x24, #0x10\n" - "udot z20.s, z6.b, z1.b[0]\n" - "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n" - "addvl x11, x11, #4\n" - "udot z21.s, z7.b, z1.b[0]\n" - "udot z18.s, z8.b, z0.b[0]\n" - "udot z22.s, z8.b, z1.b[0]\n" - "udot z19.s, z9.b, z0.b[0]\n" - "udot z23.s, z9.b, z1.b[0]\n" - "ble 26f\n" - "ld1b { z10.b }, p2/Z, [x11]\n" - "udot z16.s, z10.b, z0.b[1]\n" - "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n" - "subs x27, x27, #0x4\n" - "udot z20.s, z10.b, z1.b[1]\n" - "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n" - "udot z17.s, z4.b, z0.b[1]\n" - "addvl x11, x11, #4\n" - "udot z21.s, z4.b, z1.b[1]\n" - "udot z18.s, z5.b, z0.b[1]\n" - "udot z22.s, z5.b, z1.b[1]\n" - "udot z19.s, z6.b, z0.b[1]\n" - "udot z23.s, z6.b, z1.b[1]\n" - "ble 26f\n" - "ld1b { z7.b }, p2/Z, [x11]\n" - "udot z16.s, z7.b, z0.b[2]\n" - "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n" - "subs x27, x27, #0x4\n" - "udot z20.s, z7.b, z1.b[2]\n" - "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n" - "udot z17.s, z8.b, z0.b[2]\n" - "addvl x11, x11, #4\n" - "udot z21.s, z8.b, z1.b[2]\n" - "udot z18.s, z9.b, z0.b[2]\n" - "udot z22.s, z9.b, z1.b[2]\n" - "udot z19.s, z10.b, z0.b[2]\n" - "udot z23.s, z10.b, z1.b[2]\n" - "ble 26f\n" - "ld1b { z4.b }, p2/Z, [x11]\n" - "udot z16.s, z4.b, z0.b[3]\n" - "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" - "udot z20.s, z4.b, z1.b[3]\n" - "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" - 
"ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" - "udot z17.s, z5.b, z0.b[3]\n" - "addvl x11, x11, #4\n" - "udot z21.s, z5.b, z1.b[3]\n" - "udot z18.s, z6.b, z0.b[3]\n" - "udot z22.s, z6.b, z1.b[3]\n" - "udot z19.s, z7.b, z0.b[3]\n" - "udot z23.s, z7.b, z1.b[3]\n" - "26:" // Height 2: Multiply loop: multiply skip - "tbnz %x[flags], #31, 27f\n" + "22:" // Height 2: Multiply loop: unique 3: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "sub x24, x24, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "cmp x24, #0x10\n" + "bgt 21b\n" + "23:" // Height 2: Multiply loop: Single iteration only + "ld1b { z4.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x24, x24, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x23]\n" + "udot z16.s, z4.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" + "udot z17.s, z5.b, z0.b[0]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" + "add x22, x22, #0x10\n" + "udot z20.s, z4.b, z1.b[0]\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "udot z21.s, z5.b, z1.b[0]\n" + "udot z18.s, z6.b, z0.b[0]\n" + "udot z22.s, z6.b, z1.b[0]\n" + "udot z19.s, z7.b, z0.b[0]\n" + "udot z23.s, z7.b, z1.b[0]\n" + "ble 24f\n" + "ld1b { z8.b }, p2/Z, [x28]\n" + "udot z16.s, z8.b, z0.b[1]\n" + "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x24, x24, #0x4\n" + "udot z20.s, z8.b, z1.b[1]\n" + "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" + "udot z17.s, z9.b, z0.b[1]\n" + "addvl x28, x28, #4\n" + "udot z21.s, z9.b, z1.b[1]\n" + "udot z18.s, z10.b, z0.b[1]\n" + "udot z22.s, z10.b, z1.b[1]\n" + "udot z19.s, z4.b, z0.b[1]\n" + "udot z23.s, z4.b, z1.b[1]\n" + "ble 24f\n" + "ld1b { z5.b }, p2/Z, [x28]\n" + "udot z16.s, z5.b, z0.b[2]\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x24, x24, #0x4\n" + "udot z20.s, z5.b, z1.b[2]\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + "udot z17.s, z6.b, 
z0.b[2]\n" + "addvl x28, x28, #4\n" + "udot z21.s, z6.b, z1.b[2]\n" + "udot z18.s, z7.b, z0.b[2]\n" + "udot z22.s, z7.b, z1.b[2]\n" + "udot z19.s, z8.b, z0.b[2]\n" + "udot z23.s, z8.b, z1.b[2]\n" + "ble 24f\n" + "ld1b { z9.b }, p2/Z, [x28]\n" + "udot z16.s, z9.b, z0.b[3]\n" + "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" + "udot z20.s, z9.b, z1.b[3]\n" + "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n" + "udot z17.s, z10.b, z0.b[3]\n" + "addvl x28, x28, #4\n" + "udot z21.s, z10.b, z1.b[3]\n" + "udot z18.s, z4.b, z0.b[3]\n" + "udot z22.s, z4.b, z1.b[3]\n" + "udot z19.s, z5.b, z0.b[3]\n" + "udot z23.s, z5.b, z1.b[3]\n" + "24:" // Height 2: Multiply loop: multiply skip + "tbnz %x[flags], #31, 25f\n" "udot z11.s, z0.b, z15.b\n" "udot z12.s, z1.b, z15.b\n" - "27:" // Height 2: Multiply loop: unique 4: skip row sum - "prfm pldl1keep, [x26, #0x80]\n" - "add x28, x28, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" + "25:" // Height 2: Multiply loop: unique 4: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "add x25, x25, #0x1\n" + "prfm pldl1keep, [x22, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x28, x19\n" - "bne 20b\n" - "tbnz %x[flags], #31, 28f\n" + "cmp x25, x19\n" + "bne 18b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x22, x26, x19\n" + "tbnz %x[flags], #31, 26f\n" "add x19, %x[qp], %[b_offset]\n" "ld1rw { z2.s }, p2/Z, [x19]\n" "neg z2.s, p2/M, z2.s\n" @@ -508,19 +490,19 @@ void sve_hybrid_u8qa_dot_4x4VL ( "mov z12.s, z12.s[0]\n" "mul z11.s, p2/M, z11.s, z2.s\n" "mul z12.s, p2/M, z12.s, z2.s\n" - "28:" // Height 2: skip row sum fixup + "26:" // Height 2: skip row sum fixup "add z16.s, z16.s, z11.s\n" - "ld1w { z0.s }, p2/Z, [x10]\n" + "ld1w { z0.s }, p2/Z, [x27]\n" "orr %x[flags], %x[flags], #0x80000000\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n" + "add x23, 
%x[qp], %[per_layer_right_shift]\n" "add z18.s, z18.s, z11.s\n" - "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n" "add x19, %x[qp], %[per_layer_mul]\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" - "addvl x10, x10, #4\n" + "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" "add z20.s, z20.s, z12.s\n" "ld1rw { z4.s }, p2/Z, [x19]\n" "add z21.s, z21.s, z12.s\n" @@ -531,7 +513,7 @@ void sve_hybrid_u8qa_dot_4x4VL ( "add z18.s, z18.s, z2.s\n" "add z19.s, z19.s, z3.s\n" "add z20.s, z20.s, z0.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" "add z21.s, z21.s, z1.s\n" "add z22.s, z22.s, z2.s\n" "add z23.s, z23.s, z3.s\n" @@ -543,7 +525,7 @@ void sve_hybrid_u8qa_dot_4x4VL ( ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" - "tbz %x[flags], #5, 29f\n" + "tbz %x[flags], #5, 27f\n" "and z4.d, z16.d, z0.d\n" "asr z4.s, z4.s, #0x1f\n" "and z5.d, z17.d, z0.d\n" @@ -568,7 +550,7 @@ void sve_hybrid_u8qa_dot_4x4VL ( "sqadd z21.s, z21.s, z9.s\n" "sqadd z22.s, z22.s, z10.s\n" "sqadd z23.s, z23.s, z4.s\n" - "29:" // Height 2: no shift correction + "27:" // Height 2: no shift correction ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" "add x19, %x[qp], %[c_offset]\n" "ld1rw { z4.s }, p2/Z, [x19]\n" @@ -599,9 +581,9 @@ void sve_hybrid_u8qa_dot_4x4VL ( "uzp1 z17.h, z18.h, z19.h\n" "smax z20.s, p2/M, z20.s, z5.s\n" "uzp1 z16.b, z16.b, z17.b\n" - "st1b { z16.b }, p1, [x9]\n" + "st1b { z16.b }, p1, [x26]\n" "add z21.s, z21.s, z4.s\n" - "addvl x9, x9, #1\n" + "addvl x26, x26, #1\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" "smin z21.s, p2/M, z21.s, z6.s\n" @@ -615,41 +597,27 @@ void sve_hybrid_u8qa_dot_4x4VL ( "smax z23.s, p2/M, z23.s, z5.s\n" "uzp1 z21.h, z22.h, z23.h\n" "uzp1 z20.b, z20.b, z21.b\n" - 
"st1b { z20.b }, p1, [x25]\n" - "addvl x25, x25, #1\n" - "30:" // Height 2: Writeback done - "decw x12, ALL, MUL #4\n" - "cmp x12, XZR\n" - "bgt 18b\n" - "b 62f\n" - "31:" // Height 3 + "st1b { z20.b }, p1, [x22]\n" + "28:" // Height 2: Writeback done + "decw x9, ALL, MUL #4\n" + "cmp x9, XZR\n" + "bgt 16b\n" + "b 58f\n" + "29:" // Height 3 "mov z11.s, #0x0\n" - "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" - "mov x10, %x[col_bias]\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[col_bias]\n" "mov z12.s, #0x0\n" - "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" "bic %x[flags], %x[flags], #0x80000000\n" "mov z13.s, #0x0\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "mov z14.s, #0x0\n" + "mov x26, %x[output_ptr]\n" "mov z15.b, #0x1\n" - "tbz %x[flags], #2, 32f\n" - "ldr x9, [%x[output_ptr], #0x0]\n" - "ldr x25, [%x[output_ptr], #0x8]\n" - "add x9, x9, x19\n" - "ldr x23, [%x[output_ptr], #0x10]\n" - "add x25, x25, x19\n" - "add x23, x23, x19\n" - "b 33f\n" - "32:" // Height 3: setup direct output - "mov x9, %x[output_ptr]\n" - "add x25, x9, x19\n" - "add x23, x25, x19\n" - "33:" // Height 3: Column loop + "30:" // Height 3: Column loop "mov z16.s, #0x0\n" "mov x19, #0x0\n" "mov z17.s, #0x0\n" - "whilelt p1.b, x19, x12\n" + "whilelt p1.b, x19, x9\n" "mov z18.s, #0x0\n" "mov z19.s, #0x0\n" "mov z20.s, #0x0\n" @@ -660,83 +628,83 @@ void sve_hybrid_u8qa_dot_4x4VL ( "mov z25.s, #0x0\n" "mov z26.s, #0x0\n" "mov z27.s, #0x0\n" - "34:" // Height 3: setup done - "mov x28, #0x0\n" - "35:" // Height 3: String loop + "31:" // Height 3: setup done + "mov x25, #0x0\n" + "32:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 36f\n" - "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 33f\n" + "ldr x20, [%x[input_ptr], 
x25, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x26, [x20, #0x0]\n" - "ldr x24, [x20, #0x8]\n" - "ldr x22, [x20, #0x10]\n" - "cbnz x28, 37f\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "ldr x21, [x20, #0x10]\n" + "cbnz x25, 34f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x19\n" - "add x24, x24, x19\n" + "add x23, x23, x19\n" "add x22, x22, x19\n" - "b 37f\n" - "36:" // Height 3: setup direct input - "mov x26, %x[input_ptr]\n" - "add x24, x26, x19\n" - "add x22, x24, x19\n" - "37:" // Height 3: input setup done - "cmp x27, #0x10\n" - "ble 40f\n" - "38:" // Height 3: Multiply loop: Main loop head - "ld1b { z4.b }, p2/Z, [x11]\n" - "whilelt p0.b, XZR, x27\n" - "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" + "add x21, x21, x19\n" + "b 34f\n" + "33:" // Height 3: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "34:" // Height 3: input setup done + "cmp x24, #0x10\n" + "ble 37f\n" + "35:" // Height 3: Multiply loop: Main loop head + "ld1b { z4.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1rqb { z0.b }, p0/Z, [x23]\n" "udot z16.s, z4.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "udot z17.s, z5.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" - "udot z20.s, z4.b, z1.b[0]\n" - "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "ld1rqb { z2.b }, p0/Z, [x21]\n" "add x22, x22, #0x10\n" + "udot z20.s, z4.b, z1.b[0]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" + "add x21, x21, #0x10\n" "udot z24.s, z4.b, z2.b[0]\n" - "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" "udot z21.s, z5.b, z1.b[0]\n" - "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n" "udot z25.s, z5.b, z2.b[0]\n" - "ld1b { z9.b }, p2/Z, 
[x11, #5, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n" "udot z18.s, z6.b, z0.b[0]\n" - "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n" "udot z22.s, z6.b, z1.b[0]\n" - "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n" - "addvl x11, x11, #16\n" + "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #16\n" "udot z26.s, z6.b, z2.b[0]\n" - "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n" "udot z19.s, z7.b, z0.b[0]\n" - "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n" "udot z23.s, z7.b, z1.b[0]\n" "udot z27.s, z7.b, z2.b[0]\n" - "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n" "udot z16.s, z8.b, z0.b[1]\n" "udot z20.s, z8.b, z1.b[1]\n" "udot z24.s, z8.b, z2.b[1]\n" - "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" "udot z17.s, z9.b, z0.b[1]\n" "udot z21.s, z9.b, z1.b[1]\n" "udot z25.s, z9.b, z2.b[1]\n" - "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n" "udot z18.s, z10.b, z0.b[1]\n" "udot z22.s, z10.b, z1.b[1]\n" "udot z26.s, z10.b, z2.b[1]\n" - "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n" "udot z19.s, z4.b, z0.b[1]\n" "udot z23.s, z4.b, z1.b[1]\n" "udot z27.s, z4.b, z2.b[1]\n" - "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n" "udot z16.s, z5.b, z0.b[2]\n" "udot z20.s, z5.b, z1.b[2]\n" "udot z24.s, z5.b, z2.b[2]\n" - "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n" "udot z17.s, z6.b, z0.b[2]\n" "udot z21.s, z6.b, z1.b[2]\n" "udot z25.s, z6.b, z2.b[2]\n" @@ -758,113 +726,116 @@ void sve_hybrid_u8qa_dot_4x4VL ( "udot z19.s, z5.b, z0.b[3]\n" "udot z23.s, z5.b, z1.b[3]\n" "udot z27.s, z5.b, z2.b[3]\n" - "tbnz %x[flags], #31, 39f\n" + "tbnz %x[flags], #31, 36f\n" "udot z11.s, z0.b, z15.b\n" 
"udot z12.s, z1.b, z15.b\n" "udot z13.s, z2.b, z15.b\n" - "39:" // Height 3: Multiply loop: unique 5: skip row sum - "prfm pldl1keep, [x26, #0x80]\n" - "sub x27, x27, #0x10\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x27, #0x10\n" + "36:" // Height 3: Multiply loop: unique 5: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "sub x24, x24, #0x10\n" "prfm pldl1keep, [x22, #0x80]\n" - "bgt 38b\n" - "40:" // Height 3: Multiply loop: Single iteration only - "ld1b { z6.b }, p2/Z, [x11]\n" - "whilelt p0.b, XZR, x27\n" - "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n" - "subs x27, x27, #0x4\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" - "udot z16.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" - "udot z17.s, z7.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" - "udot z20.s, z6.b, z1.b[0]\n" - "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n" + "cmp x24, #0x10\n" + "prfm pldl1keep, [x21, #0x80]\n" + "bgt 35b\n" + "37:" // Height 3: Multiply loop: Single iteration only + "ld1b { z4.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x24, x24, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x23]\n" + "udot z16.s, z4.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" + "udot z17.s, z5.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x21]\n" "add x22, x22, #0x10\n" - "udot z24.s, z6.b, z2.b[0]\n" - "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n" - "addvl x11, x11, #4\n" - "udot z21.s, z7.b, z1.b[0]\n" - "udot z25.s, z7.b, z2.b[0]\n" - "udot z18.s, z8.b, z0.b[0]\n" - "udot z22.s, z8.b, z1.b[0]\n" - "udot z26.s, z8.b, z2.b[0]\n" - "udot z19.s, z9.b, z0.b[0]\n" - "udot z23.s, z9.b, z1.b[0]\n" - "udot z27.s, z9.b, z2.b[0]\n" - "ble 41f\n" - "ld1b { z10.b }, p2/Z, [x11]\n" - "udot z16.s, z10.b, z0.b[1]\n" - "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n" - "subs x27, x27, #0x4\n" - "udot z20.s, z10.b, z1.b[1]\n" - "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n" - "udot z24.s, z10.b, z2.b[1]\n" - "ld1b { 
z6.b }, p2/Z, [x11, #3, MUL VL]\n" - "addvl x11, x11, #4\n" - "udot z17.s, z4.b, z0.b[1]\n" - "udot z21.s, z4.b, z1.b[1]\n" - "udot z25.s, z4.b, z2.b[1]\n" - "udot z18.s, z5.b, z0.b[1]\n" - "udot z22.s, z5.b, z1.b[1]\n" - "udot z26.s, z5.b, z2.b[1]\n" - "udot z19.s, z6.b, z0.b[1]\n" - "udot z23.s, z6.b, z1.b[1]\n" - "udot z27.s, z6.b, z2.b[1]\n" - "ble 41f\n" - "ld1b { z7.b }, p2/Z, [x11]\n" - "udot z16.s, z7.b, z0.b[2]\n" - "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n" - "subs x27, x27, #0x4\n" - "udot z20.s, z7.b, z1.b[2]\n" - "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n" - "udot z24.s, z7.b, z2.b[2]\n" - "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n" - "addvl x11, x11, #4\n" - "udot z17.s, z8.b, z0.b[2]\n" - "udot z21.s, z8.b, z1.b[2]\n" - "udot z25.s, z8.b, z2.b[2]\n" - "udot z18.s, z9.b, z0.b[2]\n" - "udot z22.s, z9.b, z1.b[2]\n" - "udot z26.s, z9.b, z2.b[2]\n" - "udot z19.s, z10.b, z0.b[2]\n" - "udot z23.s, z10.b, z1.b[2]\n" - "udot z27.s, z10.b, z2.b[2]\n" - "ble 41f\n" - "ld1b { z4.b }, p2/Z, [x11]\n" - "udot z16.s, z4.b, z0.b[3]\n" - "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" - "udot z20.s, z4.b, z1.b[3]\n" - "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" - "udot z24.s, z4.b, z2.b[3]\n" - "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" - "addvl x11, x11, #4\n" - "udot z17.s, z5.b, z0.b[3]\n" - "udot z21.s, z5.b, z1.b[3]\n" - "udot z25.s, z5.b, z2.b[3]\n" - "udot z18.s, z6.b, z0.b[3]\n" - "udot z22.s, z6.b, z1.b[3]\n" - "udot z26.s, z6.b, z2.b[3]\n" - "udot z19.s, z7.b, z0.b[3]\n" - "udot z23.s, z7.b, z1.b[3]\n" - "udot z27.s, z7.b, z2.b[3]\n" - "41:" // Height 3: Multiply loop: multiply skip - "tbnz %x[flags], #31, 42f\n" + "udot z20.s, z4.b, z1.b[0]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" + "add x21, x21, #0x10\n" + "udot z24.s, z4.b, z2.b[0]\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "udot z21.s, z5.b, z1.b[0]\n" + "udot z25.s, z5.b, z2.b[0]\n" + "udot z18.s, z6.b, z0.b[0]\n" + "udot z22.s, z6.b, z1.b[0]\n" + "udot z26.s, z6.b, 
z2.b[0]\n" + "udot z19.s, z7.b, z0.b[0]\n" + "udot z23.s, z7.b, z1.b[0]\n" + "udot z27.s, z7.b, z2.b[0]\n" + "ble 38f\n" + "ld1b { z8.b }, p2/Z, [x28]\n" + "udot z16.s, z8.b, z0.b[1]\n" + "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x24, x24, #0x4\n" + "udot z20.s, z8.b, z1.b[1]\n" + "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n" + "udot z24.s, z8.b, z2.b[1]\n" + "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "udot z17.s, z9.b, z0.b[1]\n" + "udot z21.s, z9.b, z1.b[1]\n" + "udot z25.s, z9.b, z2.b[1]\n" + "udot z18.s, z10.b, z0.b[1]\n" + "udot z22.s, z10.b, z1.b[1]\n" + "udot z26.s, z10.b, z2.b[1]\n" + "udot z19.s, z4.b, z0.b[1]\n" + "udot z23.s, z4.b, z1.b[1]\n" + "udot z27.s, z4.b, z2.b[1]\n" + "ble 38f\n" + "ld1b { z5.b }, p2/Z, [x28]\n" + "udot z16.s, z5.b, z0.b[2]\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x24, x24, #0x4\n" + "udot z20.s, z5.b, z1.b[2]\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + "udot z24.s, z5.b, z2.b[2]\n" + "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "udot z17.s, z6.b, z0.b[2]\n" + "udot z21.s, z6.b, z1.b[2]\n" + "udot z25.s, z6.b, z2.b[2]\n" + "udot z18.s, z7.b, z0.b[2]\n" + "udot z22.s, z7.b, z1.b[2]\n" + "udot z26.s, z7.b, z2.b[2]\n" + "udot z19.s, z8.b, z0.b[2]\n" + "udot z23.s, z8.b, z1.b[2]\n" + "udot z27.s, z8.b, z2.b[2]\n" + "ble 38f\n" + "ld1b { z9.b }, p2/Z, [x28]\n" + "udot z16.s, z9.b, z0.b[3]\n" + "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" + "udot z20.s, z9.b, z1.b[3]\n" + "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" + "udot z24.s, z9.b, z2.b[3]\n" + "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "udot z17.s, z10.b, z0.b[3]\n" + "udot z21.s, z10.b, z1.b[3]\n" + "udot z25.s, z10.b, z2.b[3]\n" + "udot z18.s, z4.b, z0.b[3]\n" + "udot z22.s, z4.b, z1.b[3]\n" + "udot z26.s, z4.b, z2.b[3]\n" + "udot z19.s, z5.b, z0.b[3]\n" + "udot z23.s, z5.b, z1.b[3]\n" + "udot z27.s, z5.b, z2.b[3]\n" + "38:" // Height 3: Multiply loop: multiply skip + "tbnz 
%x[flags], #31, 39f\n" "udot z11.s, z0.b, z15.b\n" "udot z12.s, z1.b, z15.b\n" "udot z13.s, z2.b, z15.b\n" - "42:" // Height 3: Multiply loop: unique 6: skip row sum - "prfm pldl1keep, [x26, #0x80]\n" - "add x28, x28, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" + "39:" // Height 3: Multiply loop: unique 6: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "add x25, x25, #0x1\n" "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x28, x19\n" - "bne 35b\n" - "tbnz %x[flags], #31, 43f\n" + "cmp x25, x19\n" + "bne 32b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x22, x26, x19\n" + "add x21, x22, x19\n" + "tbnz %x[flags], #31, 40f\n" "add x19, %x[qp], %[b_offset]\n" "ld1rw { z3.s }, p2/Z, [x19]\n" "neg z3.s, p2/M, z3.s\n" @@ -879,19 +850,19 @@ void sve_hybrid_u8qa_dot_4x4VL ( "mul z11.s, p2/M, z11.s, z3.s\n" "mul z12.s, p2/M, z12.s, z3.s\n" "mul z13.s, p2/M, z13.s, z3.s\n" - "43:" // Height 3: skip row sum fixup + "40:" // Height 3: skip row sum fixup "add z16.s, z16.s, z11.s\n" - "ld1w { z0.s }, p2/Z, [x10]\n" + "ld1w { z0.s }, p2/Z, [x27]\n" "orr %x[flags], %x[flags], #0x80000000\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add z18.s, z18.s, z11.s\n" - "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n" "add x19, %x[qp], %[per_layer_mul]\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" - "addvl x10, x10, #4\n" + "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" "add z20.s, z20.s, z12.s\n" "ld1rw { z4.s }, p2/Z, [x19]\n" "add z21.s, z21.s, z12.s\n" @@ -910,7 +881,7 @@ void sve_hybrid_u8qa_dot_4x4VL ( "add z22.s, z22.s, z2.s\n" "add z23.s, z23.s, z3.s\n" "add z24.s, z24.s, z0.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" + "ld1rw { 
z0.s }, p2/Z, [x23]\n" "add z25.s, z25.s, z1.s\n" "add z26.s, z26.s, z2.s\n" "add z27.s, z27.s, z3.s\n" @@ -926,7 +897,7 @@ void sve_hybrid_u8qa_dot_4x4VL ( ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n" ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n" ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n" - "tbz %x[flags], #5, 44f\n" + "tbz %x[flags], #5, 41f\n" "and z4.d, z16.d, z0.d\n" "asr z4.s, z4.s, #0x1f\n" "and z5.d, z17.d, z0.d\n" @@ -963,7 +934,7 @@ void sve_hybrid_u8qa_dot_4x4VL ( "asr z8.s, z8.s, #0x1f\n" "sqadd z26.s, z26.s, z7.s\n" "sqadd z27.s, z27.s, z8.s\n" - "44:" // Height 3: no shift correction + "41:" // Height 3: no shift correction ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" "add x19, %x[qp], %[c_offset]\n" "ld1rw { z4.s }, p2/Z, [x19]\n" @@ -994,9 +965,9 @@ void sve_hybrid_u8qa_dot_4x4VL ( "uzp1 z17.h, z18.h, z19.h\n" "smax z20.s, p2/M, z20.s, z5.s\n" "uzp1 z16.b, z16.b, z17.b\n" - "st1b { z16.b }, p1, [x9]\n" + "st1b { z16.b }, p1, [x26]\n" "add z21.s, z21.s, z4.s\n" - "addvl x9, x9, #1\n" + "addvl x26, x26, #1\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" @@ -1019,58 +990,42 @@ void sve_hybrid_u8qa_dot_4x4VL ( "uzp1 z21.h, z22.h, z23.h\n" ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" "uzp1 z20.b, z20.b, z21.b\n" - "st1b { z20.b }, p1, [x25]\n" + "st1b { z20.b }, p1, [x22]\n" "add z26.s, z26.s, z4.s\n" - "addvl x25, x25, #1\n" - "add z27.s, z27.s, z4.s\n" "smax z25.s, p2/M, z25.s, z5.s\n" + "add z27.s, z27.s, z4.s\n" "smin z26.s, p2/M, z26.s, z6.s\n" - "smin z27.s, p2/M, z27.s, z6.s\n" "uzp1 z24.h, z24.h, z25.h\n" + "smin z27.s, p2/M, z27.s, z6.s\n" "smax z26.s, p2/M, z26.s, z5.s\n" "smax z27.s, p2/M, z27.s, z5.s\n" "uzp1 z25.h, z26.h, z27.h\n" "uzp1 z24.b, z24.b, z25.b\n" - "st1b { z24.b }, p1, [x23]\n" - "addvl x23, x23, #1\n" - "45:" // Height 3: Writeback done - "decw x12, ALL, MUL #4\n" - 
"cmp x12, XZR\n" - "bgt 33b\n" - "b 62f\n" - "46:" // Height 4 + "st1b { z24.b }, p1, [x21]\n" + "42:" // Height 3: Writeback done + "decw x9, ALL, MUL #4\n" + "cmp x9, XZR\n" + "bgt 30b\n" + "b 58f\n" + "43:" // Height 4 "mov z11.s, #0x0\n" - "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" - "mov x10, %x[col_bias]\n" + "ldr x9, [%x[args_ptr], %[offsetof_N]]\n" + "mov x27, %x[col_bias]\n" "mov z12.s, #0x0\n" - "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n" "bic %x[flags], %x[flags], #0x80000000\n" "mov z13.s, #0x0\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x26, %x[output_ptr]\n" "mov z14.s, #0x0\n" + "mov x19, #0x4\n" "mov z15.b, #0x1\n" - "tbz %x[flags], #2, 47f\n" - "ldr x9, [%x[output_ptr], #0x0]\n" - "ldr x25, [%x[output_ptr], #0x8]\n" - "add x9, x9, x19\n" - "ldr x23, [%x[output_ptr], #0x10]\n" - "ldr x21, [%x[output_ptr], #0x18]\n" - "add x25, x25, x19\n" - "add %x[output_ptr], %x[output_ptr], #0x20\n" - "add x23, x23, x19\n" - "add x21, x21, x19\n" - "b 48f\n" - "47:" // Height 4: setup direct output - "mov x9, %x[output_ptr]\n" - "add x25, x9, x19\n" - "add x23, x25, x19\n" - "add x21, x23, x19\n" - "add %x[output_ptr], x21, x19\n" - "48:" // Height 4: Column loop + "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" + "44:" // Height 4: Column loop "mov z16.s, #0x0\n" "mov x19, #0x0\n" "mov z17.s, #0x0\n" - "whilelt p1.b, x19, x12\n" + "whilelt p1.b, x19, x9\n" "mov z18.s, #0x0\n" "mov z19.s, #0x0\n" "mov z20.s, #0x0\n" @@ -1085,97 +1040,97 @@ void sve_hybrid_u8qa_dot_4x4VL ( "mov z29.s, #0x0\n" "mov z30.s, #0x0\n" "mov z31.s, #0x0\n" - "49:" // Height 4: setup done - "mov x28, #0x0\n" - "50:" // Height 4: String loop + "45:" // Height 4: setup done + "mov x25, #0x0\n" + "46:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w27, [x20, x28, LSL 
#0x2]\n" - "tbz %x[flags], #3, 51f\n" - "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "ldr w24, [x20, x25, LSL #0x2]\n" + "tbz %x[flags], #3, 47f\n" + "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x26, [x20, #0x0]\n" - "ldr x24, [x20, #0x8]\n" - "ldr x22, [x20, #0x10]\n" + "ldr x23, [x20, #0x0]\n" + "ldr x22, [x20, #0x8]\n" + "ldr x21, [x20, #0x10]\n" "ldr x20, [x20, #0x18]\n" - "cbnz x28, 52f\n" + "cbnz x25, 48f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x19\n" - "add x24, x24, x19\n" + "add x23, x23, x19\n" "add x22, x22, x19\n" + "add x21, x21, x19\n" "add x20, x20, x19\n" - "b 52f\n" - "51:" // Height 4: setup direct input - "mov x26, %x[input_ptr]\n" - "add x24, x26, x19\n" - "add x22, x24, x19\n" - "add x20, x22, x19\n" - "52:" // Height 4: input setup done - "cmp x27, #0x10\n" - "ble 55f\n" - "53:" // Height 4: Multiply loop: Main loop head - "ld1b { z4.b }, p2/Z, [x11]\n" - "whilelt p0.b, XZR, x27\n" - "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" + "b 48f\n" + "47:" // Height 4: setup direct input + "mov x23, %x[input_ptr]\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "add x20, x21, x19\n" + "48:" // Height 4: input setup done + "cmp x24, #0x10\n" + "ble 51f\n" + "49:" // Height 4: Multiply loop: Main loop head + "ld1b { z4.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1rqb { z0.b }, p0/Z, [x23]\n" "udot z16.s, z4.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "udot z17.s, z5.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x21]\n" + "add x22, x22, #0x10\n" "udot z20.s, z4.b, z1.b[0]\n" "ld1rqb { z3.b }, p0/Z, [x20]\n" - "add x22, x22, #0x10\n" + "add x21, x21, #0x10\n" "udot z24.s, z4.b, z2.b[0]\n" - "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "ld1b { 
z6.b }, p2/Z, [x28, #2, MUL VL]\n" "add x20, x20, #0x10\n" "udot z21.s, z5.b, z1.b[0]\n" - "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" "udot z25.s, z5.b, z2.b[0]\n" - "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n" "udot z28.s, z4.b, z3.b[0]\n" - "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n" "udot z29.s, z5.b, z3.b[0]\n" - "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n" "udot z18.s, z6.b, z0.b[0]\n" - "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n" - "addvl x11, x11, #16\n" + "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" + "addvl x28, x28, #16\n" "udot z22.s, z6.b, z1.b[0]\n" - "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n" "udot z26.s, z6.b, z2.b[0]\n" "udot z30.s, z6.b, z3.b[0]\n" - "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n" "udot z19.s, z7.b, z0.b[0]\n" "udot z23.s, z7.b, z1.b[0]\n" "udot z27.s, z7.b, z2.b[0]\n" "udot z31.s, z7.b, z3.b[0]\n" - "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n" "udot z16.s, z8.b, z0.b[1]\n" "udot z20.s, z8.b, z1.b[1]\n" "udot z24.s, z8.b, z2.b[1]\n" "udot z28.s, z8.b, z3.b[1]\n" - "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" "udot z17.s, z9.b, z0.b[1]\n" "udot z21.s, z9.b, z1.b[1]\n" "udot z25.s, z9.b, z2.b[1]\n" "udot z29.s, z9.b, z3.b[1]\n" - "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n" "udot z18.s, z10.b, z0.b[1]\n" "udot z22.s, z10.b, z1.b[1]\n" "udot z26.s, z10.b, z2.b[1]\n" "udot z30.s, z10.b, z3.b[1]\n" - "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n" "udot z19.s, z4.b, z0.b[1]\n" "udot z23.s, z4.b, z1.b[1]\n" "udot z27.s, z4.b, z2.b[1]\n" "udot z31.s, z4.b, z3.b[1]\n" - "ld1b { z4.b }, p2/Z, [x11, #-2, MUL 
VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n" "udot z16.s, z5.b, z0.b[2]\n" "udot z20.s, z5.b, z1.b[2]\n" "udot z24.s, z5.b, z2.b[2]\n" "udot z28.s, z5.b, z3.b[2]\n" - "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n" "udot z17.s, z6.b, z0.b[2]\n" "udot z21.s, z6.b, z1.b[2]\n" "udot z25.s, z6.b, z2.b[2]\n" @@ -1204,135 +1159,139 @@ void sve_hybrid_u8qa_dot_4x4VL ( "udot z23.s, z5.b, z1.b[3]\n" "udot z27.s, z5.b, z2.b[3]\n" "udot z31.s, z5.b, z3.b[3]\n" - "tbnz %x[flags], #31, 54f\n" + "tbnz %x[flags], #31, 50f\n" "udot z11.s, z0.b, z15.b\n" "udot z12.s, z1.b, z15.b\n" "udot z13.s, z2.b, z15.b\n" "udot z14.s, z3.b, z15.b\n" - "54:" // Height 4: Multiply loop: unique 7: skip row sum - "prfm pldl1keep, [x26, #0x80]\n" - "sub x27, x27, #0x10\n" - "prfm pldl1keep, [x24, #0x80]\n" - "cmp x27, #0x10\n" + "50:" // Height 4: Multiply loop: unique 7: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "sub x24, x24, #0x10\n" "prfm pldl1keep, [x22, #0x80]\n" + "cmp x24, #0x10\n" + "prfm pldl1keep, [x21, #0x80]\n" "prfm pldl1keep, [x20, #0x80]\n" - "bgt 53b\n" - "55:" // Height 4: Multiply loop: Single iteration only - "ld1b { z6.b }, p2/Z, [x11]\n" - "whilelt p0.b, XZR, x27\n" - "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n" - "subs x27, x27, #0x4\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" - "udot z16.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" - "udot z17.s, z7.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" - "udot z20.s, z6.b, z1.b[0]\n" - "ld1rqb { z3.b }, p0/Z, [x20]\n" + "bgt 49b\n" + "51:" // Height 4: Multiply loop: Single iteration only + "ld1b { z4.b }, p2/Z, [x28]\n" + "whilelt p0.b, XZR, x24\n" + "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x24, x24, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x23]\n" + "udot z16.s, z4.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" + "udot z17.s, z5.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x21]\n" "add x22, 
x22, #0x10\n" - "udot z24.s, z6.b, z2.b[0]\n" - "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n" + "udot z20.s, z4.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x20]\n" + "add x21, x21, #0x10\n" + "udot z24.s, z4.b, z2.b[0]\n" + "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" "add x20, x20, #0x10\n" - "udot z21.s, z7.b, z1.b[0]\n" - "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n" - "addvl x11, x11, #4\n" - "udot z28.s, z6.b, z3.b[0]\n" - "udot z25.s, z7.b, z2.b[0]\n" - "udot z29.s, z7.b, z3.b[0]\n" - "udot z18.s, z8.b, z0.b[0]\n" - "udot z22.s, z8.b, z1.b[0]\n" - "udot z26.s, z8.b, z2.b[0]\n" - "udot z30.s, z8.b, z3.b[0]\n" - "udot z19.s, z9.b, z0.b[0]\n" - "udot z23.s, z9.b, z1.b[0]\n" - "udot z27.s, z9.b, z2.b[0]\n" - "udot z31.s, z9.b, z3.b[0]\n" - "ble 56f\n" - "ld1b { z10.b }, p2/Z, [x11]\n" - "udot z16.s, z10.b, z0.b[1]\n" - "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n" - "subs x27, x27, #0x4\n" - "udot z20.s, z10.b, z1.b[1]\n" - "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n" - "udot z24.s, z10.b, z2.b[1]\n" - "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n" - "addvl x11, x11, #4\n" - "udot z28.s, z10.b, z3.b[1]\n" - "udot z17.s, z4.b, z0.b[1]\n" - "udot z21.s, z4.b, z1.b[1]\n" - "udot z25.s, z4.b, z2.b[1]\n" - "udot z29.s, z4.b, z3.b[1]\n" - "udot z18.s, z5.b, z0.b[1]\n" - "udot z22.s, z5.b, z1.b[1]\n" - "udot z26.s, z5.b, z2.b[1]\n" - "udot z30.s, z5.b, z3.b[1]\n" - "udot z19.s, z6.b, z0.b[1]\n" - "udot z23.s, z6.b, z1.b[1]\n" - "udot z27.s, z6.b, z2.b[1]\n" - "udot z31.s, z6.b, z3.b[1]\n" - "ble 56f\n" - "ld1b { z7.b }, p2/Z, [x11]\n" - "udot z16.s, z7.b, z0.b[2]\n" - "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n" - "subs x27, x27, #0x4\n" - "udot z20.s, z7.b, z1.b[2]\n" - "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n" - "udot z24.s, z7.b, z2.b[2]\n" - "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n" - "addvl x11, x11, #4\n" - "udot z28.s, z7.b, z3.b[2]\n" - "udot z17.s, z8.b, z0.b[2]\n" - "udot z21.s, z8.b, z1.b[2]\n" - "udot z25.s, z8.b, z2.b[2]\n" - "udot z29.s, z8.b, z3.b[2]\n" - "udot z18.s, z9.b, 
z0.b[2]\n" - "udot z22.s, z9.b, z1.b[2]\n" - "udot z26.s, z9.b, z2.b[2]\n" - "udot z30.s, z9.b, z3.b[2]\n" - "udot z19.s, z10.b, z0.b[2]\n" - "udot z23.s, z10.b, z1.b[2]\n" - "udot z27.s, z10.b, z2.b[2]\n" - "udot z31.s, z10.b, z3.b[2]\n" - "ble 56f\n" - "ld1b { z4.b }, p2/Z, [x11]\n" - "udot z16.s, z4.b, z0.b[3]\n" - "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" - "udot z20.s, z4.b, z1.b[3]\n" - "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" - "udot z24.s, z4.b, z2.b[3]\n" - "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" - "addvl x11, x11, #4\n" - "udot z28.s, z4.b, z3.b[3]\n" - "udot z17.s, z5.b, z0.b[3]\n" - "udot z21.s, z5.b, z1.b[3]\n" - "udot z25.s, z5.b, z2.b[3]\n" - "udot z29.s, z5.b, z3.b[3]\n" - "udot z18.s, z6.b, z0.b[3]\n" - "udot z22.s, z6.b, z1.b[3]\n" - "udot z26.s, z6.b, z2.b[3]\n" - "udot z30.s, z6.b, z3.b[3]\n" - "udot z19.s, z7.b, z0.b[3]\n" - "udot z23.s, z7.b, z1.b[3]\n" - "udot z27.s, z7.b, z2.b[3]\n" - "udot z31.s, z7.b, z3.b[3]\n" - "56:" // Height 4: Multiply loop: multiply skip - "tbnz %x[flags], #31, 57f\n" + "udot z21.s, z5.b, z1.b[0]\n" + "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "udot z28.s, z4.b, z3.b[0]\n" + "udot z25.s, z5.b, z2.b[0]\n" + "udot z29.s, z5.b, z3.b[0]\n" + "udot z18.s, z6.b, z0.b[0]\n" + "udot z22.s, z6.b, z1.b[0]\n" + "udot z26.s, z6.b, z2.b[0]\n" + "udot z30.s, z6.b, z3.b[0]\n" + "udot z19.s, z7.b, z0.b[0]\n" + "udot z23.s, z7.b, z1.b[0]\n" + "udot z27.s, z7.b, z2.b[0]\n" + "udot z31.s, z7.b, z3.b[0]\n" + "ble 52f\n" + "ld1b { z8.b }, p2/Z, [x28]\n" + "udot z16.s, z8.b, z0.b[1]\n" + "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x24, x24, #0x4\n" + "udot z20.s, z8.b, z1.b[1]\n" + "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n" + "udot z24.s, z8.b, z2.b[1]\n" + "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "udot z28.s, z8.b, z3.b[1]\n" + "udot z17.s, z9.b, z0.b[1]\n" + "udot z21.s, z9.b, z1.b[1]\n" + "udot z25.s, z9.b, z2.b[1]\n" + "udot z29.s, z9.b, z3.b[1]\n" + "udot z18.s, 
z10.b, z0.b[1]\n" + "udot z22.s, z10.b, z1.b[1]\n" + "udot z26.s, z10.b, z2.b[1]\n" + "udot z30.s, z10.b, z3.b[1]\n" + "udot z19.s, z4.b, z0.b[1]\n" + "udot z23.s, z4.b, z1.b[1]\n" + "udot z27.s, z4.b, z2.b[1]\n" + "udot z31.s, z4.b, z3.b[1]\n" + "ble 52f\n" + "ld1b { z5.b }, p2/Z, [x28]\n" + "udot z16.s, z5.b, z0.b[2]\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "subs x24, x24, #0x4\n" + "udot z20.s, z5.b, z1.b[2]\n" + "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + "udot z24.s, z5.b, z2.b[2]\n" + "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "udot z28.s, z5.b, z3.b[2]\n" + "udot z17.s, z6.b, z0.b[2]\n" + "udot z21.s, z6.b, z1.b[2]\n" + "udot z25.s, z6.b, z2.b[2]\n" + "udot z29.s, z6.b, z3.b[2]\n" + "udot z18.s, z7.b, z0.b[2]\n" + "udot z22.s, z7.b, z1.b[2]\n" + "udot z26.s, z7.b, z2.b[2]\n" + "udot z30.s, z7.b, z3.b[2]\n" + "udot z19.s, z8.b, z0.b[2]\n" + "udot z23.s, z8.b, z1.b[2]\n" + "udot z27.s, z8.b, z2.b[2]\n" + "udot z31.s, z8.b, z3.b[2]\n" + "ble 52f\n" + "ld1b { z9.b }, p2/Z, [x28]\n" + "udot z16.s, z9.b, z0.b[3]\n" + "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" + "udot z20.s, z9.b, z1.b[3]\n" + "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" + "udot z24.s, z9.b, z2.b[3]\n" + "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "udot z28.s, z9.b, z3.b[3]\n" + "udot z17.s, z10.b, z0.b[3]\n" + "udot z21.s, z10.b, z1.b[3]\n" + "udot z25.s, z10.b, z2.b[3]\n" + "udot z29.s, z10.b, z3.b[3]\n" + "udot z18.s, z4.b, z0.b[3]\n" + "udot z22.s, z4.b, z1.b[3]\n" + "udot z26.s, z4.b, z2.b[3]\n" + "udot z30.s, z4.b, z3.b[3]\n" + "udot z19.s, z5.b, z0.b[3]\n" + "udot z23.s, z5.b, z1.b[3]\n" + "udot z27.s, z5.b, z2.b[3]\n" + "udot z31.s, z5.b, z3.b[3]\n" + "52:" // Height 4: Multiply loop: multiply skip + "tbnz %x[flags], #31, 53f\n" "udot z11.s, z0.b, z15.b\n" "udot z12.s, z1.b, z15.b\n" "udot z13.s, z2.b, z15.b\n" "udot z14.s, z3.b, z15.b\n" - "57:" // Height 4: Multiply loop: unique 8: skip row sum - "prfm pldl1keep, [x26, 
#0x80]\n" - "add x28, x28, #0x1\n" - "prfm pldl1keep, [x24, #0x80]\n" + "53:" // Height 4: Multiply loop: unique 8: skip row sum + "prfm pldl1keep, [x23, #0x80]\n" + "add x25, x25, #0x1\n" "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" "prfm pldl1keep, [x20, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x28, x19\n" - "bne 50b\n" - "tbnz %x[flags], #31, 58f\n" + "cmp x25, x19\n" + "bne 46b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x22, x26, x19\n" + "add x21, x22, x19\n" + "add x20, x21, x19\n" + "tbnz %x[flags], #31, 54f\n" "add x19, %x[qp], %[b_offset]\n" "ld1rw { z4.s }, p2/Z, [x19]\n" "neg z4.s, p2/M, z4.s\n" @@ -1350,19 +1309,19 @@ void sve_hybrid_u8qa_dot_4x4VL ( "mul z12.s, p2/M, z12.s, z4.s\n" "mul z13.s, p2/M, z13.s, z4.s\n" "mul z14.s, p2/M, z14.s, z4.s\n" - "58:" // Height 4: skip row sum fixup + "54:" // Height 4: skip row sum fixup "add z16.s, z16.s, z11.s\n" - "ld1w { z0.s }, p2/Z, [x10]\n" + "ld1w { z0.s }, p2/Z, [x27]\n" "orr %x[flags], %x[flags], #0x80000000\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" - "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n" + "add x23, %x[qp], %[per_layer_right_shift]\n" "add z18.s, z18.s, z11.s\n" - "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n" "add x19, %x[qp], %[per_layer_mul]\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" - "addvl x10, x10, #4\n" + "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" "add z20.s, z20.s, z12.s\n" "ld1rw { z4.s }, p2/Z, [x19]\n" "add z21.s, z21.s, z12.s\n" @@ -1389,7 +1348,7 @@ void sve_hybrid_u8qa_dot_4x4VL ( "add z26.s, z26.s, z2.s\n" "add z27.s, z27.s, z3.s\n" "add z28.s, z28.s, z0.s\n" - "ld1rw { z0.s }, p2/Z, [x20]\n" + "ld1rw { z0.s }, p2/Z, [x23]\n" "add z29.s, z29.s, z1.s\n" "add z30.s, z30.s, z2.s\n" "add z31.s, z31.s, z3.s\n" @@ -1409,7 +1368,7 @@ void 
sve_hybrid_u8qa_dot_4x4VL ( ".inst 0x04a477bd // sqrdmulh z29.s, z29.s, z4.s\n" ".inst 0x04a477de // sqrdmulh z30.s, z30.s, z4.s\n" ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n" - "tbz %x[flags], #5, 59f\n" + "tbz %x[flags], #5, 55f\n" "and z4.d, z16.d, z0.d\n" "asr z4.s, z4.s, #0x1f\n" "and z5.d, z17.d, z0.d\n" @@ -1458,7 +1417,7 @@ void sve_hybrid_u8qa_dot_4x4VL ( "sqadd z29.s, z29.s, z10.s\n" "sqadd z30.s, z30.s, z4.s\n" "sqadd z31.s, z31.s, z5.s\n" - "59:" // Height 4: no shift correction + "55:" // Height 4: no shift correction ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" "add x19, %x[qp], %[c_offset]\n" "ld1rw { z4.s }, p2/Z, [x19]\n" @@ -1489,9 +1448,9 @@ void sve_hybrid_u8qa_dot_4x4VL ( "uzp1 z17.h, z18.h, z19.h\n" "smax z20.s, p2/M, z20.s, z5.s\n" "uzp1 z16.b, z16.b, z17.b\n" - "st1b { z16.b }, p1, [x9]\n" + "st1b { z16.b }, p1, [x26]\n" "add z21.s, z21.s, z4.s\n" - "addvl x9, x9, #1\n" + "addvl x26, x26, #1\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" @@ -1514,61 +1473,58 @@ void sve_hybrid_u8qa_dot_4x4VL ( "uzp1 z21.h, z22.h, z23.h\n" ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" "uzp1 z20.b, z20.b, z21.b\n" - "st1b { z20.b }, p1, [x25]\n" + "st1b { z20.b }, p1, [x22]\n" "add z26.s, z26.s, z4.s\n" - "addvl x25, x25, #1\n" - "add z27.s, z27.s, z4.s\n" "smax z25.s, p2/M, z25.s, z5.s\n" + "add z27.s, z27.s, z4.s\n" ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n" "smin z26.s, p2/M, z26.s, z6.s\n" - "smin z27.s, p2/M, z27.s, z6.s\n" "uzp1 z24.h, z24.h, z25.h\n" + "smin z27.s, p2/M, z27.s, z6.s\n" "add z28.s, z28.s, z4.s\n" "smax z26.s, p2/M, z26.s, z5.s\n" + ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n" "smax z27.s, p2/M, z27.s, z5.s\n" "smin z28.s, p2/M, z28.s, z6.s\n" - ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n" ".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n" + "add z29.s, 
z29.s, z4.s\n" "uzp1 z25.h, z26.h, z27.h\n" "smax z28.s, p2/M, z28.s, z5.s\n" - "add z29.s, z29.s, z4.s\n" "add z30.s, z30.s, z4.s\n" "uzp1 z24.b, z24.b, z25.b\n" - "st1b { z24.b }, p1, [x23]\n" + "st1b { z24.b }, p1, [x21]\n" "smin z29.s, p2/M, z29.s, z6.s\n" - "addvl x23, x23, #1\n" "smin z30.s, p2/M, z30.s, z6.s\n" ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" "smax z29.s, p2/M, z29.s, z5.s\n" - "add z31.s, z31.s, z4.s\n" "smax z30.s, p2/M, z30.s, z5.s\n" + "add z31.s, z31.s, z4.s\n" "uzp1 z28.h, z28.h, z29.h\n" "smin z31.s, p2/M, z31.s, z6.s\n" "smax z31.s, p2/M, z31.s, z5.s\n" "uzp1 z29.h, z30.h, z31.h\n" "uzp1 z28.b, z28.b, z29.b\n" - "st1b { z28.b }, p1, [x21]\n" - "addvl x21, x21, #1\n" - "60:" // Height 4: Writeback done - "decw x12, ALL, MUL #4\n" - "cmp x12, XZR\n" - "bgt 48b\n" + "st1b { z28.b }, p1, [x20]\n" + "56:" // Height 4: Writeback done + "decw x9, ALL, MUL #4\n" + "cmp x9, XZR\n" + "bgt 44b\n" "subs %x[M], %x[M], #0x4\n" - "beq 62f\n" + "beq 58f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 61f\n" + "tbz %x[flags], #3, 57f\n" "add x20, x20, #0x4\n" "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "61:" // Update direct input + "57:" // Update direct input "mov x19, #0x4\n" "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" "b 1b\n" - "62:" // Exit + "58:" // Exit : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, 
num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) - : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp index 8433fa605e..4ea1d17c4e 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp @@ -36,7 +36,6 @@ namespace arm_gemm { - // Actual kernel implementations void sve_hybrid_u8u32_dot_6x4VL( ARGLIST ); @@ -73,7 +72,6 @@ public: // Default to the generic kernel kern_type kernel=sve_hybrid_u8u32_dot_6x4VL; - cls_sve_hybrid_u8u32_dot_6x4VL(const CPUInfo *) { } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp index 6ee636d7f6..97f6665d85 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp @@ -80,197 +80,184 
@@ void sve_hybrid_u8u32_dot_6x4VL ( "ptrue p5.b\n" "1:" // Row loop "cmp %x[M], #0x6\n" - "bge 61f\n" + "bge 56f\n" "cmp %x[M], #0x4\n" - "bgt 49f\n" - "beq 37f\n" + "bgt 45f\n" + "beq 34f\n" "cmp %x[M], #0x2\n" - "bgt 25f\n" - "beq 13f\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 2f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "b 3f\n" - "2:" // Height 1: setup direct output - "mov x13, %x[output_ptr]\n" - "3:" // Height 1: Column loop + "bgt 23f\n" + "beq 12f\n" + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x28, %x[output_ptr]\n" + "2:" // Height 1: Column loop "mov x19, #0x0\n" - "whilelt p4.s, x19, x15\n" + "whilelt p4.s, x19, x10\n" "incw x19\n" - "whilelt p3.s, x19, x15\n" + "whilelt p3.s, x19, x10\n" "incw x19\n" - "whilelt p2.s, x19, x15\n" + "whilelt p2.s, x19, x10\n" "incw x19\n" - "whilelt p1.s, x19, x15\n" - "tbz %x[flags], #0, 4f\n" - "ld1w { z8.s }, p4/Z, [x13]\n" - "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" - "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" - "b 5f\n" - "4:" // Height 1: no accumulate + "whilelt p1.s, x19, x10\n" + "tbz %x[flags], #0, 3f\n" + "ld1w { z8.s }, p4/Z, [x28]\n" + "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n" + "b 4f\n" + "3:" // Height 1: no accumulate "mov z8.s, #0x0\n" "mov z9.s, #0x0\n" "mov z10.s, #0x0\n" "mov z11.s, #0x0\n" - "5:" // Height 1: setup done - "mov x12, #0x0\n" - "6:" // Height 1: String loop + "4:" // Height 1: setup done + "mov x27, #0x0\n" + "5:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 7f\n" - "ldr x20, 
[%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 6f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "cbnz x12, 8f\n" + "ldr x25, [x20, #0x0]\n" + "cbnz x27, 7f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "b 8f\n" - "7:" // Height 1: setup direct input - "mov x10, %x[input_ptr]\n" - "8:" // Height 1: input setup done - "cmp x11, #0x10\n" - "ble 10f\n" - "9:" // Height 1: Multiply loop: Main loop head - "ld1b { z6.b }, p5/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "sub x11, x11, #0x10\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "add x25, x25, x19\n" + "b 7f\n" + "6:" // Height 1: setup direct input + "mov x25, %x[input_ptr]\n" + "7:" // Height 1: input setup done + "cmp x26, #0x10\n" + "ble 9f\n" + "8:" // Height 1: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "udot z8.s, z6.b, z0.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" - "add x10, x10, #0x10\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" + "add x25, x25, #0x10\n" "udot z9.s, z7.b, z0.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "cmp x11, #0x10\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "cmp x26, #0x10\n" "udot z10.s, z6.b, z0.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" - "prfm pldl1keep, [x10, #0x80]\n" + "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n" + "prfm pldl1keep, [x25, #0x80]\n" "udot z11.s, z7.b, z0.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n" "udot z8.s, z6.b, z0.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n" "udot z9.s, z7.b, z0.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" - "addvl x14, x14, #16\n" + "ld1b { z7.b }, p5/Z, [x9, #7, 
MUL VL]\n" + "addvl x9, x9, #16\n" "udot z10.s, z6.b, z0.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n" "udot z11.s, z7.b, z0.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n" "udot z8.s, z6.b, z0.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n" "udot z9.s, z7.b, z0.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n" "udot z10.s, z6.b, z0.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n" "udot z11.s, z7.b, z0.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n" "udot z8.s, z6.b, z0.b[3]\n" - "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n" "udot z9.s, z7.b, z0.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n" "udot z10.s, z6.b, z0.b[3]\n" "udot z11.s, z7.b, z0.b[3]\n" - "bgt 9b\n" - "10:" // Height 1: Multiply loop: Single iteration only - "ld1b { z6.b }, p5/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "bgt 8b\n" + "9:" // Height 1: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "udot z8.s, z6.b, z0.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" - "add x10, x10, #0x10\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" + "add x25, x25, #0x10\n" "udot z9.s, z7.b, z0.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "udot z10.s, z6.b, z0.b[0]\n" "udot z11.s, z7.b, z0.b[0]\n" - "ble 11f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 10f\n" + "ld1b 
{ z6.b }, p5/Z, [x9]\n" "udot z8.s, z6.b, z0.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "udot z9.s, z7.b, z0.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" "udot z10.s, z6.b, z0.b[1]\n" - "addvl x14, x14, #4\n" + "addvl x9, x9, #4\n" "udot z11.s, z7.b, z0.b[1]\n" - "ble 11f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 10f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "udot z8.s, z6.b, z0.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "udot z9.s, z7.b, z0.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" "udot z10.s, z6.b, z0.b[2]\n" - "addvl x14, x14, #4\n" + "addvl x9, x9, #4\n" "udot z11.s, z7.b, z0.b[2]\n" - "ble 11f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 10f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "udot z8.s, z6.b, z0.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "udot z9.s, z7.b, z0.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "udot z10.s, z6.b, z0.b[3]\n" "udot z11.s, z7.b, z0.b[3]\n" - "11:" // Height 1: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" + "10:" // Height 1: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 6b\n" - "st1w { z8.s }, p4, [x13]\n" - "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" - "st1w 
{ z10.s }, p2, [x13, #2, MUL VL]\n" - "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "12:" // Height 1: Writeback done - "decw x15, ALL, MUL #4\n" - "cmp x15, XZR\n" - "bgt 3b\n" - "b 74f\n" - "13:" // Height 2 - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 14f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "add x9, x9, x19, LSL #2\n" - "b 15f\n" - "14:" // Height 2: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "15:" // Height 2: Column loop + "cmp x27, x19\n" + "bne 5b\n" + "st1w { z8.s }, p4, [x28]\n" + "st1w { z9.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "11:" // Height 1: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 2b\n" + "b 68f\n" + "12:" // Height 2 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "13:" // Height 2: Column loop "mov x19, #0x0\n" - "whilelt p4.s, x19, x15\n" + "whilelt p4.s, x19, x10\n" "incw x19\n" - "whilelt p3.s, x19, x15\n" + "whilelt p3.s, x19, x10\n" "incw x19\n" - "whilelt p2.s, x19, x15\n" + "whilelt p2.s, x19, x10\n" "incw x19\n" - "whilelt p1.s, x19, x15\n" - "tbz %x[flags], #0, 16f\n" - "ld1w { z8.s }, p4/Z, [x13]\n" - "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" - "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x9]\n" - "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" - "b 17f\n" - "16:" // Height 2: no accumulate + "whilelt p1.s, x19, x10\n" + "tbz %x[flags], #0, 14f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { 
z8.s }, p4/Z, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "b 15f\n" + "14:" // Height 2: no accumulate "mov z8.s, #0x0\n" "mov z9.s, #0x0\n" "mov z10.s, #0x0\n" @@ -279,214 +266,206 @@ void sve_hybrid_u8u32_dot_6x4VL ( "mov z13.s, #0x0\n" "mov z14.s, #0x0\n" "mov z15.s, #0x0\n" - "17:" // Height 2: setup done - "mov x12, #0x0\n" - "18:" // Height 2: String loop + "15:" // Height 2: setup done + "mov x27, #0x0\n" + "16:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 19f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 17f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "cbnz x12, 20f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x27, 18f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "b 20f\n" - "19:" // Height 2: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "20:" // Height 2: input setup done - "cmp x11, #0x10\n" - "ble 22f\n" - "21:" // Height 2: Multiply loop: Main loop head - "ld1b { z6.b }, p5/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "sub x11, x11, #0x10\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "b 18f\n" + "17:" // Height 2: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "18:" // Height 2: input setup done + "cmp x26, #0x10\n" + "ble 20f\n" + 
"19:" // Height 2: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "udot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "udot z9.s, z7.b, z0.b[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x28, x28, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x24, x24, #0x10\n" "udot z12.s, z6.b, z1.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" - "cmp x11, #0x10\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" + "cmp x26, #0x10\n" "udot z13.s, z7.b, z1.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "prfm pldl1keep, [x28, #0x80]\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "prfm pldl1keep, [x24, #0x80]\n" "udot z10.s, z6.b, z0.b[0]\n" "udot z14.s, z6.b, z1.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n" "udot z11.s, z7.b, z0.b[0]\n" "udot z15.s, z7.b, z1.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n" "udot z8.s, z6.b, z0.b[1]\n" "udot z12.s, z6.b, z1.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n" "udot z9.s, z7.b, z0.b[1]\n" "udot z13.s, z7.b, z1.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" - "addvl x14, x14, #16\n" + "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" "udot z10.s, z6.b, z0.b[1]\n" "udot z14.s, z6.b, z1.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n" "udot z11.s, z7.b, z0.b[1]\n" "udot z15.s, z7.b, z1.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n" "udot z8.s, z6.b, z0.b[2]\n" "udot z12.s, z6.b, z1.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n" "udot z9.s, z7.b, z0.b[2]\n" "udot 
z13.s, z7.b, z1.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n" "udot z10.s, z6.b, z0.b[2]\n" "udot z14.s, z6.b, z1.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n" "udot z11.s, z7.b, z0.b[2]\n" "udot z15.s, z7.b, z1.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n" "udot z8.s, z6.b, z0.b[3]\n" "udot z12.s, z6.b, z1.b[3]\n" - "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n" "udot z9.s, z7.b, z0.b[3]\n" "udot z13.s, z7.b, z1.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n" "udot z10.s, z6.b, z0.b[3]\n" "udot z14.s, z6.b, z1.b[3]\n" "udot z11.s, z7.b, z0.b[3]\n" "udot z15.s, z7.b, z1.b[3]\n" - "bgt 21b\n" - "22:" // Height 2: Multiply loop: Single iteration only - "ld1b { z6.b }, p5/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "bgt 19b\n" + "20:" // Height 2: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "udot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "udot z9.s, z7.b, z0.b[0]\n" - "add x28, x28, #0x10\n" + "add x24, x24, #0x10\n" "udot z12.s, z6.b, z1.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "udot z13.s, z7.b, z1.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "udot z10.s, z6.b, z0.b[0]\n" "udot z14.s, z6.b, z1.b[0]\n" "udot z11.s, z7.b, z0.b[0]\n" "udot z15.s, z7.b, z1.b[0]\n" - "ble 23f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" 
+ "ble 21f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "udot z8.s, z6.b, z0.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "udot z12.s, z6.b, z1.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "udot z9.s, z7.b, z0.b[1]\n" "udot z13.s, z7.b, z1.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "udot z10.s, z6.b, z0.b[1]\n" "udot z14.s, z6.b, z1.b[1]\n" "udot z11.s, z7.b, z0.b[1]\n" "udot z15.s, z7.b, z1.b[1]\n" - "ble 23f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 21f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "udot z8.s, z6.b, z0.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "udot z12.s, z6.b, z1.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "udot z9.s, z7.b, z0.b[2]\n" "udot z13.s, z7.b, z1.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "udot z10.s, z6.b, z0.b[2]\n" "udot z14.s, z6.b, z1.b[2]\n" "udot z11.s, z7.b, z0.b[2]\n" "udot z15.s, z7.b, z1.b[2]\n" - "ble 23f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 21f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "udot z8.s, z6.b, z0.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" "udot z12.s, z6.b, z1.b[3]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "udot z9.s, z7.b, z0.b[3]\n" "udot z13.s, z7.b, z1.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "udot z10.s, z6.b, z0.b[3]\n" "udot z14.s, z6.b, z1.b[3]\n" "udot z11.s, z7.b, z0.b[3]\n" "udot z15.s, z7.b, z1.b[3]\n" - 
"23:" // Height 2: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" + "21:" // Height 2: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" + "prfm pldl1keep, [x24, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 18b\n" - "st1w { z8.s }, p4, [x13]\n" - "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" - "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" - "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "st1w { z12.s }, p4, [x9]\n" - "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" - "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" - "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" - "addvl x9, x9, #4\n" - "24:" // Height 2: Writeback done - "decw x15, ALL, MUL #4\n" - "cmp x15, XZR\n" - "bgt 15b\n" - "b 74f\n" - "25:" // Height 3 - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "cmp x27, x19\n" + "bne 16b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 26f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "add x27, x27, x19, LSL #2\n" - "b 27f\n" - "26:" // Height 3: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "27:" // Height 3: Column loop + "st1w { z8.s }, p4, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "st1w { z9.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x28, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p4, [x23]\n" + "st1w { z13.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x23, #3, MUL VL]\n" + "22:" // Height 2: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 13b\n" + "b 68f\n" + "23:" // Height 3 + "ldr x10, 
[%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "24:" // Height 3: Column loop "mov x19, #0x0\n" - "whilelt p4.s, x19, x15\n" + "whilelt p4.s, x19, x10\n" "incw x19\n" - "whilelt p3.s, x19, x15\n" + "whilelt p3.s, x19, x10\n" "incw x19\n" - "whilelt p2.s, x19, x15\n" + "whilelt p2.s, x19, x10\n" "incw x19\n" - "whilelt p1.s, x19, x15\n" - "tbz %x[flags], #0, 28f\n" - "ld1w { z8.s }, p4/Z, [x13]\n" - "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" - "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x9]\n" - "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x27]\n" - "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" - "b 29f\n" - "28:" // Height 3: no accumulate + "whilelt p1.s, x19, x10\n" + "tbz %x[flags], #0, 25f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z8.s }, p4/Z, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x22]\n" + "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n" + "b 26f\n" + "25:" // Height 3: no accumulate "mov z8.s, #0x0\n" "mov z9.s, #0x0\n" "mov z10.s, #0x0\n" @@ -499,267 +478,257 @@ void sve_hybrid_u8u32_dot_6x4VL ( "mov z17.s, #0x0\n" "mov z18.s, #0x0\n" "mov z19.s, #0x0\n" - "29:" // Height 3: setup done - "mov x12, #0x0\n" - "30:" // Height 3: String loop + "26:" 
// Height 3: setup done + "mov x27, #0x0\n" + "27:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 31f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 28f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "cbnz x12, 32f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "cbnz x27, 29f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "add x26, x26, x19\n" - "b 32f\n" - "31:" // Height 3: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "add x26, x28, x19\n" - "32:" // Height 3: input setup done - "cmp x11, #0x10\n" - "ble 34f\n" - "33:" // Height 3: Multiply loop: Main loop head - "ld1b { z6.b }, p5/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "sub x11, x11, #0x10\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "b 29f\n" + "28:" // Height 3: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "29:" // Height 3: input setup done + "cmp x26, #0x10\n" + "ble 31f\n" + "30:" // Height 3: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "udot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "udot z9.s, z7.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "udot 
z12.s, z6.b, z1.b[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "add x26, x26, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x23, x23, #0x10\n" "udot z16.s, z6.b, z2.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" - "cmp x11, #0x10\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" + "cmp x26, #0x10\n" "udot z13.s, z7.b, z1.b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" "udot z17.s, z7.b, z2.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "prfm pldl1keep, [x23, #0x80]\n" "udot z10.s, z6.b, z0.b[0]\n" "udot z14.s, z6.b, z1.b[0]\n" "udot z18.s, z6.b, z2.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n" "udot z11.s, z7.b, z0.b[0]\n" "udot z15.s, z7.b, z1.b[0]\n" "udot z19.s, z7.b, z2.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n" "udot z8.s, z6.b, z0.b[1]\n" "udot z12.s, z6.b, z1.b[1]\n" "udot z16.s, z6.b, z2.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n" "udot z9.s, z7.b, z0.b[1]\n" "udot z13.s, z7.b, z1.b[1]\n" "udot z17.s, z7.b, z2.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" - "addvl x14, x14, #16\n" + "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" "udot z10.s, z6.b, z0.b[1]\n" "udot z14.s, z6.b, z1.b[1]\n" "udot z18.s, z6.b, z2.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n" "udot z11.s, z7.b, z0.b[1]\n" "udot z15.s, z7.b, z1.b[1]\n" "udot z19.s, z7.b, z2.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n" "udot z8.s, z6.b, z0.b[2]\n" "udot z12.s, z6.b, z1.b[2]\n" "udot z16.s, z6.b, z2.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n" "udot z9.s, z7.b, z0.b[2]\n" "udot z13.s, z7.b, z1.b[2]\n" "udot z17.s, z7.b, z2.b[2]\n" - "ld1b 
{ z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n" "udot z10.s, z6.b, z0.b[2]\n" "udot z14.s, z6.b, z1.b[2]\n" "udot z18.s, z6.b, z2.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n" "udot z11.s, z7.b, z0.b[2]\n" "udot z15.s, z7.b, z1.b[2]\n" "udot z19.s, z7.b, z2.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n" "udot z8.s, z6.b, z0.b[3]\n" "udot z12.s, z6.b, z1.b[3]\n" "udot z16.s, z6.b, z2.b[3]\n" - "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n" "udot z9.s, z7.b, z0.b[3]\n" "udot z13.s, z7.b, z1.b[3]\n" "udot z17.s, z7.b, z2.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n" "udot z10.s, z6.b, z0.b[3]\n" "udot z14.s, z6.b, z1.b[3]\n" "udot z18.s, z6.b, z2.b[3]\n" "udot z11.s, z7.b, z0.b[3]\n" "udot z15.s, z7.b, z1.b[3]\n" "udot z19.s, z7.b, z2.b[3]\n" - "bgt 33b\n" - "34:" // Height 3: Multiply loop: Single iteration only - "ld1b { z6.b }, p5/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "bgt 30b\n" + "31:" // Height 3: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "udot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "udot z9.s, z7.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "udot z12.s, z6.b, z1.b[0]\n" - "add x26, x26, #0x10\n" + "add x23, x23, #0x10\n" "udot z13.s, z7.b, z1.b[0]\n" "udot z16.s, z6.b, z2.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" 
"udot z17.s, z7.b, z2.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "udot z10.s, z6.b, z0.b[0]\n" "udot z14.s, z6.b, z1.b[0]\n" "udot z18.s, z6.b, z2.b[0]\n" "udot z11.s, z7.b, z0.b[0]\n" "udot z15.s, z7.b, z1.b[0]\n" "udot z19.s, z7.b, z2.b[0]\n" - "ble 35f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 32f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "udot z8.s, z6.b, z0.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "udot z12.s, z6.b, z1.b[1]\n" "udot z16.s, z6.b, z2.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "udot z9.s, z7.b, z0.b[1]\n" "udot z13.s, z7.b, z1.b[1]\n" "udot z17.s, z7.b, z2.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "udot z10.s, z6.b, z0.b[1]\n" "udot z14.s, z6.b, z1.b[1]\n" "udot z18.s, z6.b, z2.b[1]\n" "udot z11.s, z7.b, z0.b[1]\n" "udot z15.s, z7.b, z1.b[1]\n" "udot z19.s, z7.b, z2.b[1]\n" - "ble 35f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 32f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "udot z8.s, z6.b, z0.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "udot z12.s, z6.b, z1.b[2]\n" "udot z16.s, z6.b, z2.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "udot z9.s, z7.b, z0.b[2]\n" "udot z13.s, z7.b, z1.b[2]\n" "udot z17.s, z7.b, z2.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "udot z10.s, z6.b, z0.b[2]\n" "udot z14.s, z6.b, z1.b[2]\n" "udot z18.s, z6.b, z2.b[2]\n" "udot z11.s, z7.b, z0.b[2]\n" "udot z15.s, z7.b, z1.b[2]\n" "udot z19.s, z7.b, z2.b[2]\n" - "ble 35f\n" - 
"ld1b { z6.b }, p5/Z, [x14]\n" + "ble 32f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "udot z8.s, z6.b, z0.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" "udot z12.s, z6.b, z1.b[3]\n" "udot z16.s, z6.b, z2.b[3]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "udot z9.s, z7.b, z0.b[3]\n" "udot z13.s, z7.b, z1.b[3]\n" "udot z17.s, z7.b, z2.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "udot z10.s, z6.b, z0.b[3]\n" "udot z14.s, z6.b, z1.b[3]\n" "udot z18.s, z6.b, z2.b[3]\n" "udot z11.s, z7.b, z0.b[3]\n" "udot z15.s, z7.b, z1.b[3]\n" "udot z19.s, z7.b, z2.b[3]\n" - "35:" // Height 3: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "32:" // Height 3: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 30b\n" - "st1w { z8.s }, p4, [x13]\n" - "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" - "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" - "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "st1w { z12.s }, p4, [x9]\n" - "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" - "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" - "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" - "addvl x9, x9, #4\n" - "st1w { z16.s }, p4, [x27]\n" - "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" - "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" - "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" - "addvl x27, x27, #4\n" - "36:" // Height 3: Writeback done - "decw x15, ALL, MUL #4\n" - "cmp x15, XZR\n" - "bgt 27b\n" - "b 74f\n" - "37:" // Height 4 - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "cmp x27, x19\n" + 
"bne 27b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 38f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "add x27, x27, x19, LSL #2\n" - "add x25, x25, x19, LSL #2\n" - "b 39f\n" - "38:" // Height 4: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "39:" // Height 4: Column loop + "st1w { z8.s }, p4, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "st1w { z9.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x28, #2, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "st1w { z11.s }, p1, [x28, #3, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z12.s }, p4, [x23]\n" + "st1w { z13.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x23, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x22]\n" + "st1w { z17.s }, p3, [x22, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x22, #3, MUL VL]\n" + "33:" // Height 3: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 24b\n" + "b 68f\n" + "34:" // Height 4 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "35:" // Height 4: Column loop "mov x19, #0x0\n" - "whilelt p4.s, x19, x15\n" + "whilelt p4.s, x19, x10\n" "incw x19\n" - "whilelt p3.s, x19, x15\n" + "whilelt p3.s, x19, x10\n" "incw x19\n" - "whilelt p2.s, x19, x15\n" + "whilelt p2.s, x19, x10\n" "incw x19\n" - "whilelt p1.s, x19, x15\n" - "tbz %x[flags], #0, 40f\n" - "ld1w { z8.s }, p4/Z, [x13]\n" - "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" - "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x9]\n" - "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { 
z14.s }, p2/Z, [x9, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x27]\n" - "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x25]\n" - "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" - "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" - "b 41f\n" - "40:" // Height 4: no accumulate + "whilelt p1.s, x19, x10\n" + "tbz %x[flags], #0, 36f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z8.s }, p4/Z, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n" + "add x21, x22, x19, LSL #2\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x22]\n" + "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x21]\n" + "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n" + "b 37f\n" + "36:" // Height 4: no accumulate "mov z8.s, #0x0\n" "mov z9.s, #0x0\n" "mov z10.s, #0x0\n" @@ -776,123 +745,123 @@ void sve_hybrid_u8u32_dot_6x4VL ( "mov z21.s, #0x0\n" "mov z22.s, #0x0\n" "mov z23.s, #0x0\n" - "41:" // Height 4: setup done - "mov x12, #0x0\n" - "42:" // Height 4: String loop + "37:" // Height 4: setup done + "mov x27, #0x0\n" + "38:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 43f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, 
x27, LSL #0x2]\n" + "tbz %x[flags], #3, 39f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "cbnz x12, 44f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "cbnz x27, 40f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "add x26, x26, x19\n" + "add x25, x25, x19\n" "add x24, x24, x19\n" - "b 44f\n" - "43:" // Height 4: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "add x26, x28, x19\n" - "add x24, x26, x19\n" - "44:" // Height 4: input setup done - "cmp x11, #0x10\n" - "ble 46f\n" - "45:" // Height 4: Multiply loop: Main loop head - "ld1b { z6.b }, p5/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "sub x11, x11, #0x10\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "b 40f\n" + "39:" // Height 4: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "40:" // Height 4: input setup done + "cmp x26, #0x10\n" + "ble 42f\n" + "41:" // Height 4: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "udot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "udot z9.s, z7.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "udot z12.s, z6.b, z1.b[0]\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqb { z3.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "udot z16.s, z6.b, z2.b[0]\n" - "prfm pldl1keep, 
[x10, #0x80]\n" - "add x24, x24, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x22, x22, #0x10\n" "udot z13.s, z7.b, z1.b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "cmp x11, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x26, #0x10\n" "udot z20.s, z6.b, z3.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "udot z17.s, z7.b, z2.b[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "udot z21.s, z7.b, z3.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "prfm pldl1keep, [x24, #0x80]\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "prfm pldl1keep, [x22, #0x80]\n" "udot z10.s, z6.b, z0.b[0]\n" "udot z14.s, z6.b, z1.b[0]\n" "udot z18.s, z6.b, z2.b[0]\n" "udot z22.s, z6.b, z3.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n" "udot z11.s, z7.b, z0.b[0]\n" "udot z15.s, z7.b, z1.b[0]\n" "udot z19.s, z7.b, z2.b[0]\n" "udot z23.s, z7.b, z3.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n" "udot z8.s, z6.b, z0.b[1]\n" "udot z12.s, z6.b, z1.b[1]\n" "udot z16.s, z6.b, z2.b[1]\n" "udot z20.s, z6.b, z3.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n" "udot z9.s, z7.b, z0.b[1]\n" "udot z13.s, z7.b, z1.b[1]\n" "udot z17.s, z7.b, z2.b[1]\n" "udot z21.s, z7.b, z3.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" - "addvl x14, x14, #16\n" + "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" "udot z10.s, z6.b, z0.b[1]\n" "udot z14.s, z6.b, z1.b[1]\n" "udot z18.s, z6.b, z2.b[1]\n" "udot z22.s, z6.b, z3.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n" "udot z11.s, z7.b, z0.b[1]\n" "udot z15.s, z7.b, z1.b[1]\n" "udot z19.s, z7.b, z2.b[1]\n" "udot z23.s, z7.b, z3.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n" "udot z8.s, z6.b, z0.b[2]\n" "udot 
z12.s, z6.b, z1.b[2]\n" "udot z16.s, z6.b, z2.b[2]\n" "udot z20.s, z6.b, z3.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n" "udot z9.s, z7.b, z0.b[2]\n" "udot z13.s, z7.b, z1.b[2]\n" "udot z17.s, z7.b, z2.b[2]\n" "udot z21.s, z7.b, z3.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n" "udot z10.s, z6.b, z0.b[2]\n" "udot z14.s, z6.b, z1.b[2]\n" "udot z18.s, z6.b, z2.b[2]\n" "udot z22.s, z6.b, z3.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n" "udot z11.s, z7.b, z0.b[2]\n" "udot z15.s, z7.b, z1.b[2]\n" "udot z19.s, z7.b, z2.b[2]\n" "udot z23.s, z7.b, z3.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n" "udot z8.s, z6.b, z0.b[3]\n" "udot z12.s, z6.b, z1.b[3]\n" "udot z16.s, z6.b, z2.b[3]\n" "udot z20.s, z6.b, z3.b[3]\n" - "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n" "udot z9.s, z7.b, z0.b[3]\n" "udot z13.s, z7.b, z1.b[3]\n" "udot z17.s, z7.b, z2.b[3]\n" "udot z21.s, z7.b, z3.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n" "udot z10.s, z6.b, z0.b[3]\n" "udot z14.s, z6.b, z1.b[3]\n" "udot z18.s, z6.b, z2.b[3]\n" @@ -901,31 +870,31 @@ void sve_hybrid_u8u32_dot_6x4VL ( "udot z15.s, z7.b, z1.b[3]\n" "udot z19.s, z7.b, z2.b[3]\n" "udot z23.s, z7.b, z3.b[3]\n" - "bgt 45b\n" - "46:" // Height 4: Multiply loop: Single iteration only - "ld1b { z6.b }, p5/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "bgt 41b\n" + "42:" // Height 4: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "udot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, 
[x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "udot z9.s, z7.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "udot z12.s, z6.b, z1.b[0]\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqb { z3.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "udot z16.s, z6.b, z2.b[0]\n" - "add x24, x24, #0x10\n" + "add x22, x22, #0x10\n" "udot z13.s, z7.b, z1.b[0]\n" "udot z17.s, z7.b, z2.b[0]\n" "udot z20.s, z6.b, z3.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "udot z21.s, z7.b, z3.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "udot z10.s, z6.b, z0.b[0]\n" "udot z14.s, z6.b, z1.b[0]\n" "udot z18.s, z6.b, z2.b[0]\n" @@ -934,21 +903,21 @@ void sve_hybrid_u8u32_dot_6x4VL ( "udot z15.s, z7.b, z1.b[0]\n" "udot z19.s, z7.b, z2.b[0]\n" "udot z23.s, z7.b, z3.b[0]\n" - "ble 47f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 43f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "udot z8.s, z6.b, z0.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "udot z12.s, z6.b, z1.b[1]\n" "udot z16.s, z6.b, z2.b[1]\n" "udot z20.s, z6.b, z3.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "udot z9.s, z7.b, z0.b[1]\n" "udot z13.s, z7.b, z1.b[1]\n" "udot z17.s, z7.b, z2.b[1]\n" "udot z21.s, z7.b, z3.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "udot z10.s, z6.b, z0.b[1]\n" "udot z14.s, z6.b, z1.b[1]\n" "udot z18.s, z6.b, z2.b[1]\n" @@ -957,21 +926,21 @@ void sve_hybrid_u8u32_dot_6x4VL ( "udot z15.s, z7.b, z1.b[1]\n" "udot z19.s, z7.b, z2.b[1]\n" "udot z23.s, z7.b, z3.b[1]\n" - 
"ble 47f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 43f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "udot z8.s, z6.b, z0.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "udot z12.s, z6.b, z1.b[2]\n" "udot z16.s, z6.b, z2.b[2]\n" "udot z20.s, z6.b, z3.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "udot z9.s, z7.b, z0.b[2]\n" "udot z13.s, z7.b, z1.b[2]\n" "udot z17.s, z7.b, z2.b[2]\n" "udot z21.s, z7.b, z3.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "udot z10.s, z6.b, z0.b[2]\n" "udot z14.s, z6.b, z1.b[2]\n" "udot z18.s, z6.b, z2.b[2]\n" @@ -980,20 +949,20 @@ void sve_hybrid_u8u32_dot_6x4VL ( "udot z15.s, z7.b, z1.b[2]\n" "udot z19.s, z7.b, z2.b[2]\n" "udot z23.s, z7.b, z3.b[2]\n" - "ble 47f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 43f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "udot z8.s, z6.b, z0.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" "udot z12.s, z6.b, z1.b[3]\n" "udot z16.s, z6.b, z2.b[3]\n" "udot z20.s, z6.b, z3.b[3]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "udot z9.s, z7.b, z0.b[3]\n" "udot z13.s, z7.b, z1.b[3]\n" "udot z17.s, z7.b, z2.b[3]\n" "udot z21.s, z7.b, z3.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "udot z10.s, z6.b, z0.b[3]\n" "udot z14.s, z6.b, z1.b[3]\n" "udot z18.s, z6.b, z2.b[3]\n" @@ -1002,94 +971,82 @@ void sve_hybrid_u8u32_dot_6x4VL ( "udot z15.s, z7.b, z1.b[3]\n" "udot z19.s, z7.b, z2.b[3]\n" "udot z23.s, z7.b, z3.b[3]\n" - "47:" // Height 4: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" - "prfm pldl1keep, [x26, #0x80]\n" + 
"43:" // Height 4: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 42b\n" - "st1w { z8.s }, p4, [x13]\n" - "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" - "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" - "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "st1w { z12.s }, p4, [x9]\n" - "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" - "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" - "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" - "addvl x9, x9, #4\n" - "st1w { z16.s }, p4, [x27]\n" - "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" - "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" - "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" - "addvl x27, x27, #4\n" - "st1w { z20.s }, p4, [x25]\n" - "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" - "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" - "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" - "addvl x25, x25, #4\n" - "48:" // Height 4: Writeback done - "decw x15, ALL, MUL #4\n" - "cmp x15, XZR\n" - "bgt 39b\n" - "b 74f\n" - "49:" // Height 5 - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "cmp x27, x19\n" + "bne 38b\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 50f\n" - "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "ldr x23, [%x[output_ptr], #0x20]\n" - "add x27, x27, x19, LSL #2\n" - "add x25, x25, x19, LSL #2\n" - "add x23, x23, x19, LSL #2\n" - "b 51f\n" - "50:" // Height 5: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "add x23, x25, x19, LSL #2\n" - "51:" // Height 5: Column loop + "st1w { z8.s }, p4, 
[x28]\n" + "add x23, x28, x19, LSL #2\n" + "st1w { z9.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x28, #2, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "st1w { z11.s }, p1, [x28, #3, MUL VL]\n" + "add x21, x22, x19, LSL #2\n" + "st1w { z12.s }, p4, [x23]\n" + "addvl x28, x28, #4\n" + "st1w { z13.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x23, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x22]\n" + "st1w { z17.s }, p3, [x22, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x22, #3, MUL VL]\n" + "st1w { z20.s }, p4, [x21]\n" + "st1w { z21.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x21, #3, MUL VL]\n" + "44:" // Height 4: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 35b\n" + "b 68f\n" + "45:" // Height 5 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "46:" // Height 5: Column loop "mov x19, #0x0\n" - "whilelt p4.s, x19, x15\n" + "whilelt p4.s, x19, x10\n" "incw x19\n" - "whilelt p3.s, x19, x15\n" + "whilelt p3.s, x19, x10\n" "incw x19\n" - "whilelt p2.s, x19, x15\n" + "whilelt p2.s, x19, x10\n" "incw x19\n" - "whilelt p1.s, x19, x15\n" - "tbz %x[flags], #0, 52f\n" - "ld1w { z8.s }, p4/Z, [x13]\n" - "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" - "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x9]\n" - "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x27]\n" - "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x25]\n" - "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" - "ld1w { z23.s }, p1/Z, [x25, #3, 
MUL VL]\n" - "ld1w { z24.s }, p4/Z, [x23]\n" - "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n" - "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n" - "b 53f\n" - "52:" // Height 5: no accumulate + "whilelt p1.s, x19, x10\n" + "tbz %x[flags], #0, 47f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z8.s }, p4/Z, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n" + "add x21, x22, x19, LSL #2\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "add x20, x21, x19, LSL #2\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x22]\n" + "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x21]\n" + "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x20]\n" + "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n" + "b 48f\n" + "47:" // Height 5: no accumulate "mov z8.s, #0x0\n" "mov z9.s, #0x0\n" "mov z10.s, #0x0\n" @@ -1110,143 +1067,143 @@ void sve_hybrid_u8u32_dot_6x4VL ( "mov z25.s, #0x0\n" "mov z26.s, #0x0\n" "mov z27.s, #0x0\n" - "53:" // Height 5: setup done - "mov x12, #0x0\n" - "54:" // Height 5: String loop + "48:" // Height 5: setup done + "mov x27, #0x0\n" + "49:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 55f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz 
%x[flags], #3, 50f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "ldr x22, [x20, #0x20]\n" - "cbnz x12, 56f\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" + "cbnz x27, 51f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "add x26, x26, x19\n" + "add x25, x25, x19\n" "add x24, x24, x19\n" + "add x23, x23, x19\n" "add x22, x22, x19\n" - "b 56f\n" - "55:" // Height 5: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "add x26, x28, x19\n" - "add x24, x26, x19\n" - "add x22, x24, x19\n" - "56:" // Height 5: input setup done - "cmp x11, #0x10\n" - "ble 58f\n" - "57:" // Height 5: Multiply loop: Main loop head - "ld1b { z6.b }, p5/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "sub x11, x11, #0x10\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "add x21, x21, x19\n" + "b 51f\n" + "50:" // Height 5: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "51:" // Height 5: input setup done + "cmp x26, #0x10\n" + "ble 53f\n" + "52:" // Height 5: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "udot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "udot z9.s, z7.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "udot z12.s, z6.b, z1.b[0]\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqb 
{ z3.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "udot z16.s, z6.b, z2.b[0]\n" - "ld1rqb { z4.b }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" - "udot z13.s, z7.b, z1.b[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" + "ld1rqb { z4.b }, p0/Z, [x21]\n" "add x22, x22, #0x10\n" + "udot z13.s, z7.b, z1.b[0]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x21, x21, #0x10\n" "udot z20.s, z6.b, z3.b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "cmp x11, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x26, #0x10\n" "udot z24.s, z6.b, z4.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "udot z17.s, z7.b, z2.b[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "udot z21.s, z7.b, z3.b[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" "udot z25.s, z7.b, z4.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" "udot z10.s, z6.b, z0.b[0]\n" - "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" "udot z14.s, z6.b, z1.b[0]\n" "udot z18.s, z6.b, z2.b[0]\n" "udot z22.s, z6.b, z3.b[0]\n" "udot z26.s, z6.b, z4.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n" "udot z11.s, z7.b, z0.b[0]\n" "udot z15.s, z7.b, z1.b[0]\n" "udot z19.s, z7.b, z2.b[0]\n" "udot z23.s, z7.b, z3.b[0]\n" "udot z27.s, z7.b, z4.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n" "udot z8.s, z6.b, z0.b[1]\n" "udot z12.s, z6.b, z1.b[1]\n" "udot z16.s, z6.b, z2.b[1]\n" "udot z20.s, z6.b, z3.b[1]\n" "udot z24.s, z6.b, z4.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n" "udot z9.s, z7.b, z0.b[1]\n" "udot z13.s, z7.b, z1.b[1]\n" "udot z17.s, z7.b, z2.b[1]\n" "udot z21.s, z7.b, z3.b[1]\n" "udot z25.s, z7.b, z4.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" - "addvl x14, x14, #16\n" + "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl 
x9, x9, #16\n" "udot z10.s, z6.b, z0.b[1]\n" "udot z14.s, z6.b, z1.b[1]\n" "udot z18.s, z6.b, z2.b[1]\n" "udot z22.s, z6.b, z3.b[1]\n" "udot z26.s, z6.b, z4.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n" "udot z11.s, z7.b, z0.b[1]\n" "udot z15.s, z7.b, z1.b[1]\n" "udot z19.s, z7.b, z2.b[1]\n" "udot z23.s, z7.b, z3.b[1]\n" "udot z27.s, z7.b, z4.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n" "udot z8.s, z6.b, z0.b[2]\n" "udot z12.s, z6.b, z1.b[2]\n" "udot z16.s, z6.b, z2.b[2]\n" "udot z20.s, z6.b, z3.b[2]\n" "udot z24.s, z6.b, z4.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n" "udot z9.s, z7.b, z0.b[2]\n" "udot z13.s, z7.b, z1.b[2]\n" "udot z17.s, z7.b, z2.b[2]\n" "udot z21.s, z7.b, z3.b[2]\n" "udot z25.s, z7.b, z4.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n" "udot z10.s, z6.b, z0.b[2]\n" "udot z14.s, z6.b, z1.b[2]\n" "udot z18.s, z6.b, z2.b[2]\n" "udot z22.s, z6.b, z3.b[2]\n" "udot z26.s, z6.b, z4.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n" "udot z11.s, z7.b, z0.b[2]\n" "udot z15.s, z7.b, z1.b[2]\n" "udot z19.s, z7.b, z2.b[2]\n" "udot z23.s, z7.b, z3.b[2]\n" "udot z27.s, z7.b, z4.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n" "udot z8.s, z6.b, z0.b[3]\n" "udot z12.s, z6.b, z1.b[3]\n" "udot z16.s, z6.b, z2.b[3]\n" "udot z20.s, z6.b, z3.b[3]\n" "udot z24.s, z6.b, z4.b[3]\n" - "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n" "udot z9.s, z7.b, z0.b[3]\n" "udot z13.s, z7.b, z1.b[3]\n" "udot z17.s, z7.b, z2.b[3]\n" "udot z21.s, z7.b, z3.b[3]\n" "udot z25.s, z7.b, z4.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n" "udot z10.s, z6.b, z0.b[3]\n" "udot z14.s, z6.b, z1.b[3]\n" 
"udot z18.s, z6.b, z2.b[3]\n" @@ -1257,35 +1214,35 @@ void sve_hybrid_u8u32_dot_6x4VL ( "udot z19.s, z7.b, z2.b[3]\n" "udot z23.s, z7.b, z3.b[3]\n" "udot z27.s, z7.b, z4.b[3]\n" - "bgt 57b\n" - "58:" // Height 5: Multiply loop: Single iteration only - "ld1b { z6.b }, p5/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "bgt 52b\n" + "53:" // Height 5: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "udot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "udot z9.s, z7.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "udot z12.s, z6.b, z1.b[0]\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqb { z3.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "udot z16.s, z6.b, z2.b[0]\n" - "ld1rqb { z4.b }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" - "udot z13.s, z7.b, z1.b[0]\n" + "ld1rqb { z4.b }, p0/Z, [x21]\n" "add x22, x22, #0x10\n" + "udot z13.s, z7.b, z1.b[0]\n" + "add x21, x21, #0x10\n" "udot z17.s, z7.b, z2.b[0]\n" "udot z20.s, z6.b, z3.b[0]\n" "udot z24.s, z6.b, z4.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "udot z21.s, z7.b, z3.b[0]\n" "udot z25.s, z7.b, z4.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "udot z10.s, z6.b, z0.b[0]\n" "udot z14.s, z6.b, z1.b[0]\n" "udot z18.s, z6.b, z2.b[0]\n" @@ -1296,23 +1253,23 @@ void sve_hybrid_u8u32_dot_6x4VL ( "udot z19.s, z7.b, z2.b[0]\n" "udot z23.s, z7.b, z3.b[0]\n" "udot z27.s, z7.b, z4.b[0]\n" - "ble 59f\n" - "ld1b { z6.b }, 
p5/Z, [x14]\n" + "ble 54f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "udot z8.s, z6.b, z0.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "udot z12.s, z6.b, z1.b[1]\n" "udot z16.s, z6.b, z2.b[1]\n" "udot z20.s, z6.b, z3.b[1]\n" "udot z24.s, z6.b, z4.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "udot z9.s, z7.b, z0.b[1]\n" "udot z13.s, z7.b, z1.b[1]\n" "udot z17.s, z7.b, z2.b[1]\n" "udot z21.s, z7.b, z3.b[1]\n" "udot z25.s, z7.b, z4.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "udot z10.s, z6.b, z0.b[1]\n" "udot z14.s, z6.b, z1.b[1]\n" "udot z18.s, z6.b, z2.b[1]\n" @@ -1323,23 +1280,23 @@ void sve_hybrid_u8u32_dot_6x4VL ( "udot z19.s, z7.b, z2.b[1]\n" "udot z23.s, z7.b, z3.b[1]\n" "udot z27.s, z7.b, z4.b[1]\n" - "ble 59f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 54f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "udot z8.s, z6.b, z0.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "udot z12.s, z6.b, z1.b[2]\n" "udot z16.s, z6.b, z2.b[2]\n" "udot z20.s, z6.b, z3.b[2]\n" "udot z24.s, z6.b, z4.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "udot z9.s, z7.b, z0.b[2]\n" "udot z13.s, z7.b, z1.b[2]\n" "udot z17.s, z7.b, z2.b[2]\n" "udot z21.s, z7.b, z3.b[2]\n" "udot z25.s, z7.b, z4.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "udot z10.s, z6.b, z0.b[2]\n" "udot z14.s, z6.b, z1.b[2]\n" "udot z18.s, z6.b, z2.b[2]\n" @@ -1350,22 +1307,22 @@ void sve_hybrid_u8u32_dot_6x4VL ( "udot z19.s, z7.b, z2.b[2]\n" "udot z23.s, z7.b, z3.b[2]\n" "udot z27.s, z7.b, z4.b[2]\n" - "ble 59f\n" - "ld1b { z6.b }, p5/Z, 
[x14]\n" + "ble 54f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "udot z8.s, z6.b, z0.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" "udot z12.s, z6.b, z1.b[3]\n" "udot z16.s, z6.b, z2.b[3]\n" "udot z20.s, z6.b, z3.b[3]\n" "udot z24.s, z6.b, z4.b[3]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "udot z9.s, z7.b, z0.b[3]\n" "udot z13.s, z7.b, z1.b[3]\n" "udot z17.s, z7.b, z2.b[3]\n" "udot z21.s, z7.b, z3.b[3]\n" "udot z25.s, z7.b, z4.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "udot z10.s, z6.b, z0.b[3]\n" "udot z14.s, z6.b, z1.b[3]\n" "udot z18.s, z6.b, z2.b[3]\n" @@ -1376,109 +1333,96 @@ void sve_hybrid_u8u32_dot_6x4VL ( "udot z19.s, z7.b, z2.b[3]\n" "udot z23.s, z7.b, z3.b[3]\n" "udot z27.s, z7.b, z4.b[3]\n" - "59:" // Height 5: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "54:" // Height 5: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 54b\n" - "st1w { z8.s }, p4, [x13]\n" - "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" - "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" - "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "st1w { z12.s }, p4, [x9]\n" - "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" - "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" - "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" - "addvl x9, x9, #4\n" - "st1w { z16.s }, p4, [x27]\n" - "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" - "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" - "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" - "addvl x27, x27, #4\n" - "st1w { z20.s }, 
p4, [x25]\n" - "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" - "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" - "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" - "addvl x25, x25, #4\n" - "st1w { z24.s }, p4, [x23]\n" - "st1w { z25.s }, p3, [x23, #1, MUL VL]\n" - "st1w { z26.s }, p2, [x23, #2, MUL VL]\n" - "st1w { z27.s }, p1, [x23, #3, MUL VL]\n" - "addvl x23, x23, #4\n" - "60:" // Height 5: Writeback done - "decw x15, ALL, MUL #4\n" - "cmp x15, XZR\n" - "bgt 51b\n" - "b 74f\n" - "61:" // Height 6 - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "cmp x27, x19\n" + "bne 49b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "st1w { z8.s }, p4, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "st1w { z9.s }, p3, [x28, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x28, #2, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "st1w { z11.s }, p1, [x28, #3, MUL VL]\n" + "add x21, x22, x19, LSL #2\n" + "st1w { z12.s }, p4, [x23]\n" + "add x20, x21, x19, LSL #2\n" + "st1w { z13.s }, p3, [x23, #1, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z14.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x23, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x22]\n" + "st1w { z17.s }, p3, [x22, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x22, #3, MUL VL]\n" + "st1w { z20.s }, p4, [x21]\n" + "st1w { z21.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x21, #3, MUL VL]\n" + "st1w { z24.s }, p4, [x20]\n" + "st1w { z25.s }, p3, [x20, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x20, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x20, #3, MUL VL]\n" + "55:" // Height 5: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 46b\n" + "b 68f\n" + "56:" // Height 6 + "ldr x10, [%x[args_ptr], %[offsetof_N]]\n" + "mov x28, %x[output_ptr]\n" + "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x20, #0x18\n" "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" - "tbz %x[flags], #2, 62f\n" 
- "ldr x13, [%x[output_ptr], #0x0]\n" - "add x13, x13, x19, LSL #2\n" - "ldr x9, [%x[output_ptr], #0x8]\n" - "ldr x27, [%x[output_ptr], #0x10]\n" - "add x9, x9, x19, LSL #2\n" - "ldr x25, [%x[output_ptr], #0x18]\n" - "ldr x23, [%x[output_ptr], #0x20]\n" - "add x27, x27, x19, LSL #2\n" - "ldr x21, [%x[output_ptr], #0x28]\n" - "add %x[output_ptr], %x[output_ptr], #0x30\n" - "add x25, x25, x19, LSL #2\n" - "add x23, x23, x19, LSL #2\n" - "add x21, x21, x19, LSL #2\n" - "b 63f\n" - "62:" // Height 6: setup direct output - "mov x13, %x[output_ptr]\n" - "add x9, x13, x19, LSL #2\n" - "add x27, x9, x19, LSL #2\n" - "add x25, x27, x19, LSL #2\n" - "add x23, x25, x19, LSL #2\n" - "add x21, x23, x19, LSL #2\n" - "add %x[output_ptr], x21, x19, LSL #2\n" - "63:" // Height 6: Column loop + "madd %x[output_ptr], x19, x20, %x[output_ptr]\n" + "57:" // Height 6: Column loop "mov x19, #0x0\n" - "whilelt p4.s, x19, x15\n" + "whilelt p4.s, x19, x10\n" "incw x19\n" - "whilelt p3.s, x19, x15\n" + "whilelt p3.s, x19, x10\n" "incw x19\n" - "whilelt p2.s, x19, x15\n" + "whilelt p2.s, x19, x10\n" "incw x19\n" - "whilelt p1.s, x19, x15\n" - "tbz %x[flags], #0, 64f\n" - "ld1w { z8.s }, p4/Z, [x13]\n" - "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" - "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x9]\n" - "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x27]\n" - "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x25]\n" - "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" - "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z24.s }, p4/Z, [x23]\n" - "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n" - "ld1w { z27.s }, p1/Z, [x23, 
#3, MUL VL]\n" - "ld1w { z28.s }, p4/Z, [x21]\n" - "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n" - "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n" - "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n" - "b 65f\n" - "64:" // Height 6: no accumulate + "whilelt p1.s, x19, x10\n" + "tbz %x[flags], #0, 58f\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ld1w { z8.s }, p4/Z, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n" + "add x21, x22, x19, LSL #2\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "add x20, x21, x19, LSL #2\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "add x19, x20, x19, LSL #2\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x22]\n" + "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x21]\n" + "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x20]\n" + "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n" + "ld1w { z28.s }, p4/Z, [x19]\n" + "ld1w { z29.s }, p3/Z, [x19, #1, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x19, #2, MUL VL]\n" + "ld1w { z31.s }, p1/Z, [x19, #3, MUL VL]\n" + "b 59f\n" + "58:" // Height 6: no accumulate "mov z8.s, #0x0\n" "mov z9.s, #0x0\n" "mov z10.s, #0x0\n" @@ -1503,77 +1447,77 @@ void sve_hybrid_u8u32_dot_6x4VL ( "mov z29.s, #0x0\n" "mov z30.s, #0x0\n" "mov z31.s, #0x0\n" - "65:" // Height 6: setup done - "mov x12, #0x0\n" - "66:" // Height 6: String loop + "59:" // Height 6: setup done + "mov x27, #0x0\n" + "60:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr x19, 
[%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "tbz %x[flags], #3, 67f\n" - "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "ldr w26, [x20, x27, LSL #0x2]\n" + "tbz %x[flags], #3, 61f\n" + "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" "add x20, x20, x19, LSL #3\n" - "ldr x10, [x20, #0x0]\n" - "ldr x28, [x20, #0x8]\n" - "ldr x26, [x20, #0x10]\n" - "ldr x24, [x20, #0x18]\n" - "ldr x22, [x20, #0x20]\n" + "ldr x25, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x23, [x20, #0x10]\n" + "ldr x22, [x20, #0x18]\n" + "ldr x21, [x20, #0x20]\n" "ldr x20, [x20, #0x28]\n" - "cbnz x12, 68f\n" + "cbnz x27, 62f\n" "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x19\n" - "add x28, x28, x19\n" - "add x26, x26, x19\n" + "add x25, x25, x19\n" "add x24, x24, x19\n" + "add x23, x23, x19\n" "add x22, x22, x19\n" + "add x21, x21, x19\n" "add x20, x20, x19\n" - "b 68f\n" - "67:" // Height 6: setup direct input - "mov x10, %x[input_ptr]\n" - "add x28, x10, x19\n" - "add x26, x28, x19\n" - "add x24, x26, x19\n" - "add x22, x24, x19\n" - "add x20, x22, x19\n" - "68:" // Height 6: input setup done - "cmp x11, #0x10\n" - "ble 70f\n" - "69:" // Height 6: Multiply loop: Main loop head - "ld1b { z6.b }, p5/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "sub x11, x11, #0x10\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "b 62f\n" + "61:" // Height 6: setup direct input + "mov x25, %x[input_ptr]\n" + "add x24, x25, x19\n" + "add x23, x24, x19\n" + "add x22, x23, x19\n" + "add x21, x22, x19\n" + "add x20, x21, x19\n" + "62:" // Height 6: input setup done + "cmp x26, #0x10\n" + "ble 64f\n" + "63:" // Height 6: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "sub x26, x26, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "udot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b 
}, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "udot z9.s, z7.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "udot z12.s, z6.b, z1.b[0]\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqb { z3.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "udot z16.s, z6.b, z2.b[0]\n" - "ld1rqb { z4.b }, p0/Z, [x22]\n" - "add x24, x24, #0x10\n" + "ld1rqb { z4.b }, p0/Z, [x21]\n" + "add x22, x22, #0x10\n" "udot z13.s, z7.b, z1.b[0]\n" "ld1rqb { z5.b }, p0/Z, [x20]\n" - "add x22, x22, #0x10\n" + "add x21, x21, #0x10\n" "udot z20.s, z6.b, z3.b[0]\n" - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" "add x20, x20, #0x10\n" "udot z24.s, z6.b, z4.b[0]\n" - "prfm pldl1keep, [x28, #0x80]\n" - "cmp x11, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x26, #0x10\n" "udot z28.s, z6.b, z5.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "udot z17.s, z7.b, z2.b[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "udot z21.s, z7.b, z3.b[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "udot z25.s, z7.b, z4.b[0]\n" "prfm pldl1keep, [x22, #0x80]\n" + "udot z25.s, z7.b, z4.b[0]\n" + "prfm pldl1keep, [x21, #0x80]\n" "udot z29.s, z7.b, z5.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" "udot z10.s, z6.b, z0.b[0]\n" "prfm pldl1keep, [x20, #0x80]\n" "udot z14.s, z6.b, z1.b[0]\n" @@ -1581,85 +1525,85 @@ void sve_hybrid_u8u32_dot_6x4VL ( "udot z22.s, z6.b, z3.b[0]\n" "udot z26.s, z6.b, z4.b[0]\n" "udot z30.s, z6.b, z5.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n" "udot z11.s, z7.b, z0.b[0]\n" "udot z15.s, z7.b, z1.b[0]\n" "udot z19.s, z7.b, z2.b[0]\n" "udot z23.s, z7.b, z3.b[0]\n" "udot z27.s, z7.b, z4.b[0]\n" "udot z31.s, z7.b, z5.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, 
#5, MUL VL]\n" "udot z8.s, z6.b, z0.b[1]\n" "udot z12.s, z6.b, z1.b[1]\n" "udot z16.s, z6.b, z2.b[1]\n" "udot z20.s, z6.b, z3.b[1]\n" "udot z24.s, z6.b, z4.b[1]\n" "udot z28.s, z6.b, z5.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n" "udot z9.s, z7.b, z0.b[1]\n" "udot z13.s, z7.b, z1.b[1]\n" "udot z17.s, z7.b, z2.b[1]\n" "udot z21.s, z7.b, z3.b[1]\n" "udot z25.s, z7.b, z4.b[1]\n" "udot z29.s, z7.b, z5.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" - "addvl x14, x14, #16\n" + "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n" + "addvl x9, x9, #16\n" "udot z10.s, z6.b, z0.b[1]\n" "udot z14.s, z6.b, z1.b[1]\n" "udot z18.s, z6.b, z2.b[1]\n" "udot z22.s, z6.b, z3.b[1]\n" "udot z26.s, z6.b, z4.b[1]\n" "udot z30.s, z6.b, z5.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n" "udot z11.s, z7.b, z0.b[1]\n" "udot z15.s, z7.b, z1.b[1]\n" "udot z19.s, z7.b, z2.b[1]\n" "udot z23.s, z7.b, z3.b[1]\n" "udot z27.s, z7.b, z4.b[1]\n" "udot z31.s, z7.b, z5.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n" "udot z8.s, z6.b, z0.b[2]\n" "udot z12.s, z6.b, z1.b[2]\n" "udot z16.s, z6.b, z2.b[2]\n" "udot z20.s, z6.b, z3.b[2]\n" "udot z24.s, z6.b, z4.b[2]\n" "udot z28.s, z6.b, z5.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n" "udot z9.s, z7.b, z0.b[2]\n" "udot z13.s, z7.b, z1.b[2]\n" "udot z17.s, z7.b, z2.b[2]\n" "udot z21.s, z7.b, z3.b[2]\n" "udot z25.s, z7.b, z4.b[2]\n" "udot z29.s, z7.b, z5.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n" "udot z10.s, z6.b, z0.b[2]\n" "udot z14.s, z6.b, z1.b[2]\n" "udot z18.s, z6.b, z2.b[2]\n" "udot z22.s, z6.b, z3.b[2]\n" "udot z26.s, z6.b, z4.b[2]\n" "udot z30.s, z6.b, z5.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n" "udot z11.s, z7.b, z0.b[2]\n" "udot z15.s, 
z7.b, z1.b[2]\n" "udot z19.s, z7.b, z2.b[2]\n" "udot z23.s, z7.b, z3.b[2]\n" "udot z27.s, z7.b, z4.b[2]\n" "udot z31.s, z7.b, z5.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n" "udot z8.s, z6.b, z0.b[3]\n" "udot z12.s, z6.b, z1.b[3]\n" "udot z16.s, z6.b, z2.b[3]\n" "udot z20.s, z6.b, z3.b[3]\n" "udot z24.s, z6.b, z4.b[3]\n" "udot z28.s, z6.b, z5.b[3]\n" - "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n" "udot z9.s, z7.b, z0.b[3]\n" "udot z13.s, z7.b, z1.b[3]\n" "udot z17.s, z7.b, z2.b[3]\n" "udot z21.s, z7.b, z3.b[3]\n" "udot z25.s, z7.b, z4.b[3]\n" "udot z29.s, z7.b, z5.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n" "udot z10.s, z6.b, z0.b[3]\n" "udot z14.s, z6.b, z1.b[3]\n" "udot z18.s, z6.b, z2.b[3]\n" @@ -1672,39 +1616,39 @@ void sve_hybrid_u8u32_dot_6x4VL ( "udot z23.s, z7.b, z3.b[3]\n" "udot z27.s, z7.b, z4.b[3]\n" "udot z31.s, z7.b, z5.b[3]\n" - "bgt 69b\n" - "70:" // Height 6: Multiply loop: Single iteration only - "ld1b { z6.b }, p5/Z, [x14]\n" - "whilelt p0.b, XZR, x11\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" - "ld1rqb { z0.b }, p0/Z, [x10]\n" + "bgt 63b\n" + "64:" // Height 6: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x9]\n" + "whilelt p0.b, XZR, x26\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "udot z8.s, z6.b, z0.b[0]\n" - "ld1rqb { z1.b }, p0/Z, [x28]\n" - "add x10, x10, #0x10\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x25, x25, #0x10\n" "udot z9.s, z7.b, z0.b[0]\n" - "ld1rqb { z2.b }, p0/Z, [x26]\n" - "add x28, x28, #0x10\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "add x24, x24, #0x10\n" "udot z12.s, z6.b, z1.b[0]\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "add x26, x26, #0x10\n" + "ld1rqb { z3.b }, p0/Z, [x22]\n" + "add x23, x23, #0x10\n" "udot z16.s, z6.b, z2.b[0]\n" - "ld1rqb { z4.b }, p0/Z, 
[x22]\n" - "add x24, x24, #0x10\n" + "ld1rqb { z4.b }, p0/Z, [x21]\n" + "add x22, x22, #0x10\n" "udot z13.s, z7.b, z1.b[0]\n" "ld1rqb { z5.b }, p0/Z, [x20]\n" - "add x22, x22, #0x10\n" + "add x21, x21, #0x10\n" "udot z20.s, z6.b, z3.b[0]\n" "add x20, x20, #0x10\n" "udot z17.s, z7.b, z2.b[0]\n" "udot z24.s, z6.b, z4.b[0]\n" "udot z28.s, z6.b, z5.b[0]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "udot z21.s, z7.b, z3.b[0]\n" "udot z25.s, z7.b, z4.b[0]\n" "udot z29.s, z7.b, z5.b[0]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "udot z10.s, z6.b, z0.b[0]\n" "udot z14.s, z6.b, z1.b[0]\n" "udot z18.s, z6.b, z2.b[0]\n" @@ -1717,25 +1661,25 @@ void sve_hybrid_u8u32_dot_6x4VL ( "udot z23.s, z7.b, z3.b[0]\n" "udot z27.s, z7.b, z4.b[0]\n" "udot z31.s, z7.b, z5.b[0]\n" - "ble 71f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 65f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "udot z8.s, z6.b, z0.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "udot z12.s, z6.b, z1.b[1]\n" "udot z16.s, z6.b, z2.b[1]\n" "udot z20.s, z6.b, z3.b[1]\n" "udot z24.s, z6.b, z4.b[1]\n" "udot z28.s, z6.b, z5.b[1]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "udot z9.s, z7.b, z0.b[1]\n" "udot z13.s, z7.b, z1.b[1]\n" "udot z17.s, z7.b, z2.b[1]\n" "udot z21.s, z7.b, z3.b[1]\n" "udot z25.s, z7.b, z4.b[1]\n" "udot z29.s, z7.b, z5.b[1]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "udot z10.s, z6.b, z0.b[1]\n" "udot z14.s, z6.b, z1.b[1]\n" "udot z18.s, z6.b, z2.b[1]\n" @@ -1748,25 +1692,25 @@ void sve_hybrid_u8u32_dot_6x4VL ( "udot z23.s, z7.b, z3.b[1]\n" "udot z27.s, z7.b, z4.b[1]\n" "udot z31.s, z7.b, z5.b[1]\n" - "ble 71f\n" - "ld1b { z6.b }, 
p5/Z, [x14]\n" + "ble 65f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "udot z8.s, z6.b, z0.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" - "subs x11, x11, #0x4\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" + "subs x26, x26, #0x4\n" "udot z12.s, z6.b, z1.b[2]\n" "udot z16.s, z6.b, z2.b[2]\n" "udot z20.s, z6.b, z3.b[2]\n" "udot z24.s, z6.b, z4.b[2]\n" "udot z28.s, z6.b, z5.b[2]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "udot z9.s, z7.b, z0.b[2]\n" "udot z13.s, z7.b, z1.b[2]\n" "udot z17.s, z7.b, z2.b[2]\n" "udot z21.s, z7.b, z3.b[2]\n" "udot z25.s, z7.b, z4.b[2]\n" "udot z29.s, z7.b, z5.b[2]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "udot z10.s, z6.b, z0.b[2]\n" "udot z14.s, z6.b, z1.b[2]\n" "udot z18.s, z6.b, z2.b[2]\n" @@ -1779,24 +1723,24 @@ void sve_hybrid_u8u32_dot_6x4VL ( "udot z23.s, z7.b, z3.b[2]\n" "udot z27.s, z7.b, z4.b[2]\n" "udot z31.s, z7.b, z5.b[2]\n" - "ble 71f\n" - "ld1b { z6.b }, p5/Z, [x14]\n" + "ble 65f\n" + "ld1b { z6.b }, p5/Z, [x9]\n" "udot z8.s, z6.b, z0.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n" "udot z12.s, z6.b, z1.b[3]\n" "udot z16.s, z6.b, z2.b[3]\n" "udot z20.s, z6.b, z3.b[3]\n" "udot z24.s, z6.b, z4.b[3]\n" "udot z28.s, z6.b, z5.b[3]\n" - "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n" "udot z9.s, z7.b, z0.b[3]\n" "udot z13.s, z7.b, z1.b[3]\n" "udot z17.s, z7.b, z2.b[3]\n" "udot z21.s, z7.b, z3.b[3]\n" "udot z25.s, z7.b, z4.b[3]\n" "udot z29.s, z7.b, z5.b[3]\n" - "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" - "addvl x14, x14, #4\n" + "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" "udot z10.s, z6.b, z0.b[3]\n" "udot z14.s, z6.b, z1.b[3]\n" "udot z18.s, z6.b, z2.b[3]\n" @@ -1809,67 +1753,68 @@ void sve_hybrid_u8u32_dot_6x4VL ( "udot z23.s, z7.b, z3.b[3]\n" "udot z27.s, z7.b, z4.b[3]\n" 
"udot z31.s, z7.b, z5.b[3]\n" - "71:" // Height 6: Multiply loop: multiply skip - "prfm pldl1keep, [x10, #0x80]\n" - "add x12, x12, #0x1\n" - "prfm pldl1keep, [x28, #0x80]\n" - "prfm pldl1keep, [x26, #0x80]\n" + "65:" // Height 6: Multiply loop: multiply skip + "prfm pldl1keep, [x25, #0x80]\n" + "add x27, x27, #0x1\n" "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" "prfm pldl1keep, [x20, #0x80]\n" "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" - "cmp x12, x19\n" - "bne 66b\n" - "st1w { z8.s }, p4, [x13]\n" - "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" - "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" - "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" - "addvl x13, x13, #4\n" - "st1w { z12.s }, p4, [x9]\n" - "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" - "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" - "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" - "addvl x9, x9, #4\n" - "st1w { z16.s }, p4, [x27]\n" - "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" - "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" - "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" - "addvl x27, x27, #4\n" - "st1w { z20.s }, p4, [x25]\n" - "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" - "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" - "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" - "addvl x25, x25, #4\n" - "st1w { z24.s }, p4, [x23]\n" - "st1w { z25.s }, p3, [x23, #1, MUL VL]\n" - "st1w { z26.s }, p2, [x23, #2, MUL VL]\n" - "st1w { z27.s }, p1, [x23, #3, MUL VL]\n" - "addvl x23, x23, #4\n" - "st1w { z28.s }, p4, [x21]\n" - "st1w { z29.s }, p3, [x21, #1, MUL VL]\n" - "st1w { z30.s }, p2, [x21, #2, MUL VL]\n" - "st1w { z31.s }, p1, [x21, #3, MUL VL]\n" - "addvl x21, x21, #4\n" - "72:" // Height 6: Writeback done - "decw x15, ALL, MUL #4\n" - "cmp x15, XZR\n" - "bgt 63b\n" + "cmp x27, x19\n" + "bne 60b\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "st1w { z8.s }, p4, [x28]\n" + "add x23, x28, x19, LSL #2\n" + "st1w { z9.s }, p3, [x28, #1, MUL VL]\n" + 
"st1w { z10.s }, p2, [x28, #2, MUL VL]\n" + "add x22, x23, x19, LSL #2\n" + "st1w { z11.s }, p1, [x28, #3, MUL VL]\n" + "add x21, x22, x19, LSL #2\n" + "st1w { z12.s }, p4, [x23]\n" + "add x20, x21, x19, LSL #2\n" + "st1w { z13.s }, p3, [x23, #1, MUL VL]\n" + "add x19, x20, x19, LSL #2\n" + "st1w { z14.s }, p2, [x23, #2, MUL VL]\n" + "addvl x28, x28, #4\n" + "st1w { z15.s }, p1, [x23, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x22]\n" + "st1w { z17.s }, p3, [x22, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x22, #3, MUL VL]\n" + "st1w { z20.s }, p4, [x21]\n" + "st1w { z21.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x21, #3, MUL VL]\n" + "st1w { z24.s }, p4, [x20]\n" + "st1w { z25.s }, p3, [x20, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x20, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x20, #3, MUL VL]\n" + "st1w { z28.s }, p4, [x19]\n" + "st1w { z29.s }, p3, [x19, #1, MUL VL]\n" + "st1w { z30.s }, p2, [x19, #2, MUL VL]\n" + "st1w { z31.s }, p1, [x19, #3, MUL VL]\n" + "66:" // Height 6: Writeback done + "decw x10, ALL, MUL #4\n" + "cmp x10, XZR\n" + "bgt 57b\n" "subs %x[M], %x[M], #0x6\n" - "beq 74f\n" + "beq 68f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 73f\n" + "tbz %x[flags], #3, 67f\n" "add x20, x20, #0x6\n" "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" - "73:" // Update direct input + "67:" // Update direct input "mov x19, #0x6\n" "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" "b 1b\n" - "74:" // Exit + "68:" // Exit : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, 
num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) - : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp b/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp index 8fdd2c920d..bea455ca67 100644 --- a/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp +++ b/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp @@ -22,9 +22,6 @@ * SOFTWARE. */ #pragma once -#if (defined(__GNUC__) && (__GNUC__ >= 7)) -#pragma GCC diagnostic ignored "-Wimplicit-fallthrough" -#endif #ifdef __arm__ -- cgit v1.2.1